diff mbox series

[10/12] mtd: rawnand: brcmnand: Add BCMBCA read data bus interface

Message ID 20230606231252.94838-11-william.zhang@broadcom.com (mailing list archive)
State New, archived
Headers show
Series mtd: rawnand: brcmnand: driver and doc updates | expand

Commit Message

William Zhang June 6, 2023, 11:12 p.m. UTC
The BCMBCA broadband SoC integrates the NAND controller differently than
STB, iProc and other SoCs.  It has different endianness for NAND cache
data and ONFI parameter data.

Add a SoC read data bus shim for BCMBCA to meet this SoC-specific need
and to improve performance by using the optimized memcpy function on
NAND cache memory.

Signed-off-by: William Zhang <william.zhang@broadcom.com>
---

 drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c | 36 +++++++++++++++++
 drivers/mtd/nand/raw/brcmnand/brcmnand.c    | 44 ++++++++++++++-------
 drivers/mtd/nand/raw/brcmnand/brcmnand.h    |  2 +
 3 files changed, 68 insertions(+), 14 deletions(-)

Comments

Miquel Raynal June 7, 2023, 8:20 a.m. UTC | #1
Hi William,

william.zhang@broadcom.com wrote on Tue,  6 Jun 2023 16:12:50 -0700:

> The BCMBCA broadband SoC integrates the NAND controller differently than
> STB, iProc and other SoCs.  It has different endianness for NAND cache
> data and ONFI parameter data.
> 
> Add a SoC read data bus shim for BCMBCA to meet the specific SoC need
> and performance improvement using the optimized memcpy function on NAND
> cache memory.
> 
> Signed-off-by: William Zhang <william.zhang@broadcom.com>
> ---
> 
>  drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c | 36 +++++++++++++++++
>  drivers/mtd/nand/raw/brcmnand/brcmnand.c    | 44 ++++++++++++++-------
>  drivers/mtd/nand/raw/brcmnand/brcmnand.h    |  2 +
>  3 files changed, 68 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> index 7e48b6a0bfa2..899103a62c98 100644
> --- a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> +++ b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> @@ -26,6 +26,18 @@ enum {
>  	BCMBCA_CTLRDY		= BIT(4),
>  };
>  
> +#if defined(CONFIG_ARM64)
> +#define ALIGN_REQ		8
> +#else
> +#define ALIGN_REQ		4
> +#endif
> +
> +static inline bool bcmbca_nand_is_buf_aligned(void *flash_cache,  void *buffer)
> +{
> +	return IS_ALIGNED((uintptr_t)buffer, ALIGN_REQ) &&
> +				IS_ALIGNED((uintptr_t)flash_cache, ALIGN_REQ);
> +}
> +
>  static bool bcmbca_nand_intc_ack(struct brcmnand_soc *soc)
>  {
>  	struct bcmbca_nand_soc *priv =
> @@ -56,6 +68,29 @@ static void bcmbca_nand_intc_set(struct brcmnand_soc *soc, bool en)
>  	brcmnand_writel(val, mmio);
>  }
>  
> +static void bcmbca_read_data_bus(struct brcmnand_soc *soc,
> +				 void __iomem *flash_cache,  u32 *buffer,
> +				 int fc_words, bool is_param)
> +{
> +	int i;
> +
> +	if (!is_param) {
> +		/*
> +		 * memcpy can do unaligned aligned access depending on source
> +		 * and dest address, which is incompatible with nand cache. Fallback
> +		 * to the memcpy for io version
> +		 */
> +		if (bcmbca_nand_is_buf_aligned(flash_cache, buffer))
> +			memcpy((void *)buffer, (void *)flash_cache, fc_words * 4);
> +		else
> +			memcpy_fromio((void *)buffer, (void *)flash_cache, fc_words * 4);
> +	} else {
> +		/* Flash cache has same endian as the host for parameter pages */
> +		for (i = 0; i < fc_words; i++, buffer++)
> +			*buffer = __raw_readl(flash_cache + i * 4);
> +	}
> +}
> +
>  static int bcmbca_nand_probe(struct platform_device *pdev)
>  {
>  	struct device *dev = &pdev->dev;
> @@ -75,6 +110,7 @@ static int bcmbca_nand_probe(struct platform_device *pdev)
>  
>  	soc->ctlrdy_ack = bcmbca_nand_intc_ack;
>  	soc->ctlrdy_set_enabled = bcmbca_nand_intc_set;
> +	soc->read_data_bus = bcmbca_read_data_bus;
>  
>  	return brcmnand_probe(pdev, soc);
>  }
> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> index d920e88c7f5b..656be4d73016 100644
> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> @@ -814,6 +814,30 @@ static inline u32 edu_readl(struct brcmnand_controller *ctrl,
>  	return brcmnand_readl(ctrl->edu_base + offs);
>  }
>  
> +static inline void brcmnand_read_data_bus(struct brcmnand_controller *ctrl,
> +					   void __iomem *flash_cache, u32 *buffer,
> +					   int fc_words, bool is_param)

I strongly dislike this "is_param" boolean.

When is the data in host endianness? When is it not?

If we think about an exec_op() conversion and drop cmdfunc(), what
would be the discriminant?

> +{
> +	struct brcmnand_soc *soc = ctrl->soc;
> +	int i;
> +
> +	if (soc->read_data_bus) {
> +		soc->read_data_bus(soc, flash_cache, buffer, fc_words, is_param);
> +	} else {
> +		if (!is_param) {
> +			for (i = 0; i < fc_words; i++, buffer++)
> +				*buffer = brcmnand_read_fc(ctrl, i);
> +		} else {
> +			for (i = 0; i < fc_words; i++)
> +				/*
> +				 * Flash cache is big endian for parameter pages, at
> +				 * least on STB SoCs
> +				 */
> +				buffer[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
> +		}
> +	}
> +}
> +
>  static void brcmnand_clear_ecc_addr(struct brcmnand_controller *ctrl)
>  {
>  
> @@ -1811,20 +1835,11 @@ static void brcmnand_cmdfunc(struct nand_chip *chip, unsigned command,
>  			native_cmd == CMD_PARAMETER_CHANGE_COL) {
>  		/* Copy flash cache word-wise */
>  		u32 *flash_cache = (u32 *)ctrl->flash_cache;
> -		int i;
>  
>  		brcmnand_soc_data_bus_prepare(ctrl->soc, true);
>  
> -		/*
> -		 * Must cache the FLASH_CACHE now, since changes in
> -		 * SECTOR_SIZE_1K may invalidate it
> -		 */
> -		for (i = 0; i < FC_WORDS; i++)
> -			/*
> -			 * Flash cache is big endian for parameter pages, at
> -			 * least on STB SoCs
> -			 */
> -			flash_cache[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
> +		brcmnand_read_data_bus(ctrl, ctrl->nand_fc, flash_cache,
> +				   FC_WORDS, true);
>  
>  		brcmnand_soc_data_bus_unprepare(ctrl->soc, true);
>  
> @@ -2137,7 +2152,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>  {
>  	struct brcmnand_host *host = nand_get_controller_data(chip);
>  	struct brcmnand_controller *ctrl = host->ctrl;
> -	int i, j, ret = 0;
> +	int i, ret = 0;
>  
>  	brcmnand_clear_ecc_addr(ctrl);
>  
> @@ -2150,8 +2165,9 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>  		if (likely(buf)) {
>  			brcmnand_soc_data_bus_prepare(ctrl->soc, false);
>  
> -			for (j = 0; j < FC_WORDS; j++, buf++)
> -				*buf = brcmnand_read_fc(ctrl, j);
> +			brcmnand_read_data_bus(ctrl, ctrl->nand_fc, buf,
> +					FC_WORDS, false);
> +			buf += FC_WORDS;
>  
>  			brcmnand_soc_data_bus_unprepare(ctrl->soc, false);
>  		}
> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.h b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> index f1f93d85f50d..88819bc395f8 100644
> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> @@ -24,6 +24,8 @@ struct brcmnand_soc {
>  	void (*ctlrdy_set_enabled)(struct brcmnand_soc *soc, bool en);
>  	void (*prepare_data_bus)(struct brcmnand_soc *soc, bool prepare,
>  				 bool is_param);
> +	void (*read_data_bus)(struct brcmnand_soc *soc, void __iomem *flash_cache,
> +				 u32 *buffer, int fc_words, bool is_param);
>  	const struct brcmnand_io_ops *ops;
>  };
>  


Thanks,
Miquèl
Miquel Raynal June 7, 2023, 8:22 a.m. UTC | #2
Hi William,

william.zhang@broadcom.com wrote on Tue,  6 Jun 2023 16:12:50 -0700:

> The BCMBCA broadband SoC integrates the NAND controller differently than
> STB, iProc and other SoCs.  It has different endianness for NAND cache
> data and ONFI parameter data.
> 
> Add a SoC read data bus shim for BCMBCA to meet the specific SoC need
> and performance improvement using the optimized memcpy function on NAND
> cache memory.
> 
> Signed-off-by: William Zhang <william.zhang@broadcom.com>
> ---
> 
>  drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c | 36 +++++++++++++++++
>  drivers/mtd/nand/raw/brcmnand/brcmnand.c    | 44 ++++++++++++++-------
>  drivers/mtd/nand/raw/brcmnand/brcmnand.h    |  2 +
>  3 files changed, 68 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> index 7e48b6a0bfa2..899103a62c98 100644
> --- a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> +++ b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> @@ -26,6 +26,18 @@ enum {
>  	BCMBCA_CTLRDY		= BIT(4),
>  };
>  
> +#if defined(CONFIG_ARM64)
> +#define ALIGN_REQ		8
> +#else
> +#define ALIGN_REQ		4
> +#endif
> +
> +static inline bool bcmbca_nand_is_buf_aligned(void *flash_cache,  void *buffer)
> +{
> +	return IS_ALIGNED((uintptr_t)buffer, ALIGN_REQ) &&
> +				IS_ALIGNED((uintptr_t)flash_cache, ALIGN_REQ);
> +}
> +
>  static bool bcmbca_nand_intc_ack(struct brcmnand_soc *soc)
>  {
>  	struct bcmbca_nand_soc *priv =
> @@ -56,6 +68,29 @@ static void bcmbca_nand_intc_set(struct brcmnand_soc *soc, bool en)
>  	brcmnand_writel(val, mmio);
>  }
>  
> +static void bcmbca_read_data_bus(struct brcmnand_soc *soc,
> +				 void __iomem *flash_cache,  u32 *buffer,
> +				 int fc_words, bool is_param)
> +{
> +	int i;
> +
> +	if (!is_param) {
> +		/*
> +		 * memcpy can do unaligned aligned access depending on source
> +		 * and dest address, which is incompatible with nand cache. Fallback
> +		 * to the memcpy for io version
> +		 */
> +		if (bcmbca_nand_is_buf_aligned(flash_cache, buffer))
> +			memcpy((void *)buffer, (void *)flash_cache, fc_words * 4);
> +		else
> +			memcpy_fromio((void *)buffer, (void *)flash_cache, fc_words * 4);
> +	} else {
> +		/* Flash cache has same endian as the host for parameter pages */
> +		for (i = 0; i < fc_words; i++, buffer++)
> +			*buffer = __raw_readl(flash_cache + i * 4);
> +	}
> +}
> +
>  static int bcmbca_nand_probe(struct platform_device *pdev)
>  {
>  	struct device *dev = &pdev->dev;
> @@ -75,6 +110,7 @@ static int bcmbca_nand_probe(struct platform_device *pdev)
>  
>  	soc->ctlrdy_ack = bcmbca_nand_intc_ack;
>  	soc->ctlrdy_set_enabled = bcmbca_nand_intc_set;
> +	soc->read_data_bus = bcmbca_read_data_bus;
>  
>  	return brcmnand_probe(pdev, soc);
>  }
> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> index d920e88c7f5b..656be4d73016 100644
> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> @@ -814,6 +814,30 @@ static inline u32 edu_readl(struct brcmnand_controller *ctrl,
>  	return brcmnand_readl(ctrl->edu_base + offs);
>  }
>  
> +static inline void brcmnand_read_data_bus(struct brcmnand_controller *ctrl,
> +					   void __iomem *flash_cache, u32 *buffer,
> +					   int fc_words, bool is_param)
> +{
> +	struct brcmnand_soc *soc = ctrl->soc;
> +	int i;
> +
> +	if (soc->read_data_bus) {
> +		soc->read_data_bus(soc, flash_cache, buffer, fc_words, is_param);
> +	} else {
> +		if (!is_param) {
> +			for (i = 0; i < fc_words; i++, buffer++)
> +				*buffer = brcmnand_read_fc(ctrl, i);
> +		} else {
> +			for (i = 0; i < fc_words; i++)
> +				/*
> +				 * Flash cache is big endian for parameter pages, at
> +				 * least on STB SoCs
> +				 */
> +				buffer[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
> +		}
> +	}

Perhaps we could have a single function that is statically assigned at
probe time instead of a first helper with two conditions which calls in
one case another hook... This can be simplified I guess.

> +}
> +
>  static void brcmnand_clear_ecc_addr(struct brcmnand_controller *ctrl)
>  {
>  
> @@ -1811,20 +1835,11 @@ static void brcmnand_cmdfunc(struct nand_chip *chip, unsigned command,
>  			native_cmd == CMD_PARAMETER_CHANGE_COL) {
>  		/* Copy flash cache word-wise */
>  		u32 *flash_cache = (u32 *)ctrl->flash_cache;
> -		int i;
>  
>  		brcmnand_soc_data_bus_prepare(ctrl->soc, true);
>  
> -		/*
> -		 * Must cache the FLASH_CACHE now, since changes in
> -		 * SECTOR_SIZE_1K may invalidate it
> -		 */
> -		for (i = 0; i < FC_WORDS; i++)
> -			/*
> -			 * Flash cache is big endian for parameter pages, at
> -			 * least on STB SoCs
> -			 */
> -			flash_cache[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
> +		brcmnand_read_data_bus(ctrl, ctrl->nand_fc, flash_cache,
> +				   FC_WORDS, true);
>  
>  		brcmnand_soc_data_bus_unprepare(ctrl->soc, true);
>  
> @@ -2137,7 +2152,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>  {
>  	struct brcmnand_host *host = nand_get_controller_data(chip);
>  	struct brcmnand_controller *ctrl = host->ctrl;
> -	int i, j, ret = 0;
> +	int i, ret = 0;
>  
>  	brcmnand_clear_ecc_addr(ctrl);
>  
> @@ -2150,8 +2165,9 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>  		if (likely(buf)) {
>  			brcmnand_soc_data_bus_prepare(ctrl->soc, false);
>  
> -			for (j = 0; j < FC_WORDS; j++, buf++)
> -				*buf = brcmnand_read_fc(ctrl, j);
> +			brcmnand_read_data_bus(ctrl, ctrl->nand_fc, buf,
> +					FC_WORDS, false);
> +			buf += FC_WORDS;
>  
>  			brcmnand_soc_data_bus_unprepare(ctrl->soc, false);
>  		}
> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.h b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> index f1f93d85f50d..88819bc395f8 100644
> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> @@ -24,6 +24,8 @@ struct brcmnand_soc {
>  	void (*ctlrdy_set_enabled)(struct brcmnand_soc *soc, bool en);
>  	void (*prepare_data_bus)(struct brcmnand_soc *soc, bool prepare,
>  				 bool is_param);
> +	void (*read_data_bus)(struct brcmnand_soc *soc, void __iomem *flash_cache,
> +				 u32 *buffer, int fc_words, bool is_param);
>  	const struct brcmnand_io_ops *ops;
>  };
>  


Thanks,
Miquèl
William Zhang June 7, 2023, 8:12 p.m. UTC | #3
Hi Miquel,

On 06/07/2023 01:20 AM, Miquel Raynal wrote:
> Hi William,
> 
> william.zhang@broadcom.com wrote on Tue,  6 Jun 2023 16:12:50 -0700:
> 
>> The BCMBCA broadband SoC integrates the NAND controller differently than
>> STB, iProc and other SoCs.  It has different endianness for NAND cache
>> data and ONFI parameter data.
>>
>> Add a SoC read data bus shim for BCMBCA to meet the specific SoC need
>> and performance improvement using the optimized memcpy function on NAND
>> cache memory.
>>
>> Signed-off-by: William Zhang <william.zhang@broadcom.com>
>> ---
>>
>>   drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c | 36 +++++++++++++++++
>>   drivers/mtd/nand/raw/brcmnand/brcmnand.c    | 44 ++++++++++++++-------
>>   drivers/mtd/nand/raw/brcmnand/brcmnand.h    |  2 +
>>   3 files changed, 68 insertions(+), 14 deletions(-)
>>
>> diff --git a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>> index 7e48b6a0bfa2..899103a62c98 100644
>> --- a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>> +++ b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>> @@ -26,6 +26,18 @@ enum {
>>   	BCMBCA_CTLRDY		= BIT(4),
>>   };
>>   
>> +#if defined(CONFIG_ARM64)
>> +#define ALIGN_REQ		8
>> +#else
>> +#define ALIGN_REQ		4
>> +#endif
>> +
>> +static inline bool bcmbca_nand_is_buf_aligned(void *flash_cache,  void *buffer)
>> +{
>> +	return IS_ALIGNED((uintptr_t)buffer, ALIGN_REQ) &&
>> +				IS_ALIGNED((uintptr_t)flash_cache, ALIGN_REQ);
>> +}
>> +
>>   static bool bcmbca_nand_intc_ack(struct brcmnand_soc *soc)
>>   {
>>   	struct bcmbca_nand_soc *priv =
>> @@ -56,6 +68,29 @@ static void bcmbca_nand_intc_set(struct brcmnand_soc *soc, bool en)
>>   	brcmnand_writel(val, mmio);
>>   }
>>   
>> +static void bcmbca_read_data_bus(struct brcmnand_soc *soc,
>> +				 void __iomem *flash_cache,  u32 *buffer,
>> +				 int fc_words, bool is_param)
>> +{
>> +	int i;
>> +
>> +	if (!is_param) {
>> +		/*
>> +		 * memcpy can do unaligned aligned access depending on source
>> +		 * and dest address, which is incompatible with nand cache. Fallback
>> +		 * to the memcpy for io version
>> +		 */
>> +		if (bcmbca_nand_is_buf_aligned(flash_cache, buffer))
>> +			memcpy((void *)buffer, (void *)flash_cache, fc_words * 4);
>> +		else
>> +			memcpy_fromio((void *)buffer, (void *)flash_cache, fc_words * 4);
>> +	} else {
>> +		/* Flash cache has same endian as the host for parameter pages */
>> +		for (i = 0; i < fc_words; i++, buffer++)
>> +			*buffer = __raw_readl(flash_cache + i * 4);
>> +	}
>> +}
>> +
>>   static int bcmbca_nand_probe(struct platform_device *pdev)
>>   {
>>   	struct device *dev = &pdev->dev;
>> @@ -75,6 +110,7 @@ static int bcmbca_nand_probe(struct platform_device *pdev)
>>   
>>   	soc->ctlrdy_ack = bcmbca_nand_intc_ack;
>>   	soc->ctlrdy_set_enabled = bcmbca_nand_intc_set;
>> +	soc->read_data_bus = bcmbca_read_data_bus;
>>   
>>   	return brcmnand_probe(pdev, soc);
>>   }
>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>> index d920e88c7f5b..656be4d73016 100644
>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>> @@ -814,6 +814,30 @@ static inline u32 edu_readl(struct brcmnand_controller *ctrl,
>>   	return brcmnand_readl(ctrl->edu_base + offs);
>>   }
>>   
>> +static inline void brcmnand_read_data_bus(struct brcmnand_controller *ctrl,
>> +					   void __iomem *flash_cache, u32 *buffer,
>> +					   int fc_words, bool is_param)
> 
> I strongly dislike this "is_param" boolean.
> 
> When is the data in host endianness? When is it not?
This is a little bit complicated.  We have two types of data read from the
NAND cache: one for page reads and the other for parameter and ONFI data
read from the controller side. But it depends on how the SoC integrates the
NAND cache into the system. In broadband SoCs, both page and parameter data
are in host endianness, but this is not the case for other SoCs.

I am open to suggestions for the is_param function argument, but to factor
out this common code in a more structured way, I don't see any other way
around it.

> 
> If we think about an exec_op() conversion and drop cmdfunc(), what
> would be the discriminant?
> 
If we need to implement exec_op in the future, the data will not come
from the NAND cache but from some other low-level data register, which may
not be subject to the endianness issue.

>> +{
>> +	struct brcmnand_soc *soc = ctrl->soc;
>> +	int i;
>> +
>> +	if (soc->read_data_bus) {
>> +		soc->read_data_bus(soc, flash_cache, buffer, fc_words, is_param);
>> +	} else {
>> +		if (!is_param) {
>> +			for (i = 0; i < fc_words; i++, buffer++)
>> +				*buffer = brcmnand_read_fc(ctrl, i);
>> +		} else {
>> +			for (i = 0; i < fc_words; i++)
>> +				/*
>> +				 * Flash cache is big endian for parameter pages, at
>> +				 * least on STB SoCs
>> +				 */
>> +				buffer[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
>> +		}
>> +	}
>> +}
>> +
>>   static void brcmnand_clear_ecc_addr(struct brcmnand_controller *ctrl)
>>   {
>>   
>> @@ -1811,20 +1835,11 @@ static void brcmnand_cmdfunc(struct nand_chip *chip, unsigned command,
>>   			native_cmd == CMD_PARAMETER_CHANGE_COL) {
>>   		/* Copy flash cache word-wise */
>>   		u32 *flash_cache = (u32 *)ctrl->flash_cache;
>> -		int i;
>>   
>>   		brcmnand_soc_data_bus_prepare(ctrl->soc, true);
>>   
>> -		/*
>> -		 * Must cache the FLASH_CACHE now, since changes in
>> -		 * SECTOR_SIZE_1K may invalidate it
>> -		 */
>> -		for (i = 0; i < FC_WORDS; i++)
>> -			/*
>> -			 * Flash cache is big endian for parameter pages, at
>> -			 * least on STB SoCs
>> -			 */
>> -			flash_cache[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
>> +		brcmnand_read_data_bus(ctrl, ctrl->nand_fc, flash_cache,
>> +				   FC_WORDS, true);
>>   
>>   		brcmnand_soc_data_bus_unprepare(ctrl->soc, true);
>>   
>> @@ -2137,7 +2152,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>>   {
>>   	struct brcmnand_host *host = nand_get_controller_data(chip);
>>   	struct brcmnand_controller *ctrl = host->ctrl;
>> -	int i, j, ret = 0;
>> +	int i, ret = 0;
>>   
>>   	brcmnand_clear_ecc_addr(ctrl);
>>   
>> @@ -2150,8 +2165,9 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>>   		if (likely(buf)) {
>>   			brcmnand_soc_data_bus_prepare(ctrl->soc, false);
>>   
>> -			for (j = 0; j < FC_WORDS; j++, buf++)
>> -				*buf = brcmnand_read_fc(ctrl, j);
>> +			brcmnand_read_data_bus(ctrl, ctrl->nand_fc, buf,
>> +					FC_WORDS, false);
>> +			buf += FC_WORDS;
>>   
>>   			brcmnand_soc_data_bus_unprepare(ctrl->soc, false);
>>   		}
>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.h b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>> index f1f93d85f50d..88819bc395f8 100644
>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>> @@ -24,6 +24,8 @@ struct brcmnand_soc {
>>   	void (*ctlrdy_set_enabled)(struct brcmnand_soc *soc, bool en);
>>   	void (*prepare_data_bus)(struct brcmnand_soc *soc, bool prepare,
>>   				 bool is_param);
>> +	void (*read_data_bus)(struct brcmnand_soc *soc, void __iomem *flash_cache,
>> +				 u32 *buffer, int fc_words, bool is_param);
>>   	const struct brcmnand_io_ops *ops;
>>   };
>>   
> 
> 
> Thanks,
> Miquèl
>
William Zhang June 7, 2023, 8:24 p.m. UTC | #4
Hi Miquel,

On 06/07/2023 01:22 AM, Miquel Raynal wrote:
> Hi William,
> 
> william.zhang@broadcom.com wrote on Tue,  6 Jun 2023 16:12:50 -0700:
> 
>> The BCMBCA broadband SoC integrates the NAND controller differently than
>> STB, iProc and other SoCs.  It has different endianness for NAND cache
>> data and ONFI parameter data.
>>
>> Add a SoC read data bus shim for BCMBCA to meet the specific SoC need
>> and performance improvement using the optimized memcpy function on NAND
>> cache memory.
>>
>> Signed-off-by: William Zhang <william.zhang@broadcom.com>
>> ---
>>
>>   drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c | 36 +++++++++++++++++
>>   drivers/mtd/nand/raw/brcmnand/brcmnand.c    | 44 ++++++++++++++-------
>>   drivers/mtd/nand/raw/brcmnand/brcmnand.h    |  2 +
>>   3 files changed, 68 insertions(+), 14 deletions(-)
>>
>> diff --git a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>> index 7e48b6a0bfa2..899103a62c98 100644
>> --- a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>> +++ b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>> @@ -26,6 +26,18 @@ enum {
>>   	BCMBCA_CTLRDY		= BIT(4),
>>   };
>>   
>> +#if defined(CONFIG_ARM64)
>> +#define ALIGN_REQ		8
>> +#else
>> +#define ALIGN_REQ		4
>> +#endif
>> +
>> +static inline bool bcmbca_nand_is_buf_aligned(void *flash_cache,  void *buffer)
>> +{
>> +	return IS_ALIGNED((uintptr_t)buffer, ALIGN_REQ) &&
>> +				IS_ALIGNED((uintptr_t)flash_cache, ALIGN_REQ);
>> +}
>> +
>>   static bool bcmbca_nand_intc_ack(struct brcmnand_soc *soc)
>>   {
>>   	struct bcmbca_nand_soc *priv =
>> @@ -56,6 +68,29 @@ static void bcmbca_nand_intc_set(struct brcmnand_soc *soc, bool en)
>>   	brcmnand_writel(val, mmio);
>>   }
>>   
>> +static void bcmbca_read_data_bus(struct brcmnand_soc *soc,
>> +				 void __iomem *flash_cache,  u32 *buffer,
>> +				 int fc_words, bool is_param)
>> +{
>> +	int i;
>> +
>> +	if (!is_param) {
>> +		/*
>> +		 * memcpy can do unaligned aligned access depending on source
>> +		 * and dest address, which is incompatible with nand cache. Fallback
>> +		 * to the memcpy for io version
>> +		 */
>> +		if (bcmbca_nand_is_buf_aligned(flash_cache, buffer))
>> +			memcpy((void *)buffer, (void *)flash_cache, fc_words * 4);
>> +		else
>> +			memcpy_fromio((void *)buffer, (void *)flash_cache, fc_words * 4);
>> +	} else {
>> +		/* Flash cache has same endian as the host for parameter pages */
>> +		for (i = 0; i < fc_words; i++, buffer++)
>> +			*buffer = __raw_readl(flash_cache + i * 4);
>> +	}
>> +}
>> +
>>   static int bcmbca_nand_probe(struct platform_device *pdev)
>>   {
>>   	struct device *dev = &pdev->dev;
>> @@ -75,6 +110,7 @@ static int bcmbca_nand_probe(struct platform_device *pdev)
>>   
>>   	soc->ctlrdy_ack = bcmbca_nand_intc_ack;
>>   	soc->ctlrdy_set_enabled = bcmbca_nand_intc_set;
>> +	soc->read_data_bus = bcmbca_read_data_bus;
>>   
>>   	return brcmnand_probe(pdev, soc);
>>   }
>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>> index d920e88c7f5b..656be4d73016 100644
>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>> @@ -814,6 +814,30 @@ static inline u32 edu_readl(struct brcmnand_controller *ctrl,
>>   	return brcmnand_readl(ctrl->edu_base + offs);
>>   }
>>   
>> +static inline void brcmnand_read_data_bus(struct brcmnand_controller *ctrl,
>> +					   void __iomem *flash_cache, u32 *buffer,
>> +					   int fc_words, bool is_param)
>> +{
>> +	struct brcmnand_soc *soc = ctrl->soc;
>> +	int i;
>> +
>> +	if (soc->read_data_bus) {
>> +		soc->read_data_bus(soc, flash_cache, buffer, fc_words, is_param);
>> +	} else {
>> +		if (!is_param) {
>> +			for (i = 0; i < fc_words; i++, buffer++)
>> +				*buffer = brcmnand_read_fc(ctrl, i);
>> +		} else {
>> +			for (i = 0; i < fc_words; i++)
>> +				/*
>> +				 * Flash cache is big endian for parameter pages, at
>> +				 * least on STB SoCs
>> +				 */
>> +				buffer[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
>> +		}
>> +	}
> 
> Perhaps we could have a single function that is statically assigned at
> probe time instead of a first helper with two conditions which calls in
> one case another hook... This can be simplified I guess.
> 
Well, this would need to be done at the SoC-specific implementation level
(bcm<xxx>_nand.c), and each SoC would need to have either a general data bus
read function with the is_param option, or separate data_bus_read_page and
data_bus_read_param functions.  I am not sure how much this can be
simplified... Or we could have a default implementation in brcmnand.c, but
then there is still one condition check. Page reads are done in 512-byte
bursts. One or two condition checks outside of the per-512-byte read loop
do not sound too bad if performance is the concern.

>> +}
>> +
>>   static void brcmnand_clear_ecc_addr(struct brcmnand_controller *ctrl)
>>   {
>>   
>> @@ -1811,20 +1835,11 @@ static void brcmnand_cmdfunc(struct nand_chip *chip, unsigned command,
>>   			native_cmd == CMD_PARAMETER_CHANGE_COL) {
>>   		/* Copy flash cache word-wise */
>>   		u32 *flash_cache = (u32 *)ctrl->flash_cache;
>> -		int i;
>>   
>>   		brcmnand_soc_data_bus_prepare(ctrl->soc, true);
>>   
>> -		/*
>> -		 * Must cache the FLASH_CACHE now, since changes in
>> -		 * SECTOR_SIZE_1K may invalidate it
>> -		 */
>> -		for (i = 0; i < FC_WORDS; i++)
>> -			/*
>> -			 * Flash cache is big endian for parameter pages, at
>> -			 * least on STB SoCs
>> -			 */
>> -			flash_cache[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
>> +		brcmnand_read_data_bus(ctrl, ctrl->nand_fc, flash_cache,
>> +				   FC_WORDS, true);
>>   
>>   		brcmnand_soc_data_bus_unprepare(ctrl->soc, true);
>>   
>> @@ -2137,7 +2152,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>>   {
>>   	struct brcmnand_host *host = nand_get_controller_data(chip);
>>   	struct brcmnand_controller *ctrl = host->ctrl;
>> -	int i, j, ret = 0;
>> +	int i, ret = 0;
>>   
>>   	brcmnand_clear_ecc_addr(ctrl);
>>   
>> @@ -2150,8 +2165,9 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>>   		if (likely(buf)) {
>>   			brcmnand_soc_data_bus_prepare(ctrl->soc, false);
>>   
>> -			for (j = 0; j < FC_WORDS; j++, buf++)
>> -				*buf = brcmnand_read_fc(ctrl, j);
>> +			brcmnand_read_data_bus(ctrl, ctrl->nand_fc, buf,
>> +					FC_WORDS, false);
>> +			buf += FC_WORDS;
>>   
>>   			brcmnand_soc_data_bus_unprepare(ctrl->soc, false);
>>   		}
>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.h b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>> index f1f93d85f50d..88819bc395f8 100644
>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>> @@ -24,6 +24,8 @@ struct brcmnand_soc {
>>   	void (*ctlrdy_set_enabled)(struct brcmnand_soc *soc, bool en);
>>   	void (*prepare_data_bus)(struct brcmnand_soc *soc, bool prepare,
>>   				 bool is_param);
>> +	void (*read_data_bus)(struct brcmnand_soc *soc, void __iomem *flash_cache,
>> +				 u32 *buffer, int fc_words, bool is_param);
>>   	const struct brcmnand_io_ops *ops;
>>   };
>>   
> 
> 
> Thanks,
> Miquèl
>
Miquel Raynal June 8, 2023, 6:15 a.m. UTC | #5
Hi William,

william.zhang@broadcom.com wrote on Wed, 7 Jun 2023 13:12:02 -0700:

> Hi Miquel,
> 
> On 06/07/2023 01:20 AM, Miquel Raynal wrote:
> > Hi William,
> > 
> > william.zhang@broadcom.com wrote on Tue,  6 Jun 2023 16:12:50 -0700:
> >   
> >> The BCMBCA broadband SoC integrates the NAND controller differently than
> >> STB, iProc and other SoCs.  It has different endianness for NAND cache
> >> data and ONFI parameter data.
> >>
> >> Add a SoC read data bus shim for BCMBCA to meet the specific SoC need
> >> and performance improvement using the optimized memcpy function on NAND
> >> cache memory.
> >>
> >> Signed-off-by: William Zhang <william.zhang@broadcom.com>
> >> ---
> >>
> >>   drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c | 36 +++++++++++++++++
> >>   drivers/mtd/nand/raw/brcmnand/brcmnand.c    | 44 ++++++++++++++-------
> >>   drivers/mtd/nand/raw/brcmnand/brcmnand.h    |  2 +
> >>   3 files changed, 68 insertions(+), 14 deletions(-)
> >>
> >> diff --git a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> >> index 7e48b6a0bfa2..899103a62c98 100644
> >> --- a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> >> +++ b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> >> @@ -26,6 +26,18 @@ enum {
> >>   	BCMBCA_CTLRDY		= BIT(4),
> >>   };
> >>   
> >> +#if defined(CONFIG_ARM64)
> >> +#define ALIGN_REQ		8
> >> +#else
> >> +#define ALIGN_REQ		4
> >> +#endif
> >> +
> >> +static inline bool bcmbca_nand_is_buf_aligned(void *flash_cache,  void *buffer)
> >> +{
> >> +	return IS_ALIGNED((uintptr_t)buffer, ALIGN_REQ) &&
> >> +				IS_ALIGNED((uintptr_t)flash_cache, ALIGN_REQ);
> >> +}
> >> +
> >>   static bool bcmbca_nand_intc_ack(struct brcmnand_soc *soc)
> >>   {
> >>   	struct bcmbca_nand_soc *priv =
> >> @@ -56,6 +68,29 @@ static void bcmbca_nand_intc_set(struct brcmnand_soc *soc, bool en)
> >>   	brcmnand_writel(val, mmio);
> >>   }
> >>   
> >> +static void bcmbca_read_data_bus(struct brcmnand_soc *soc,
> >> +				 void __iomem *flash_cache,  u32 *buffer,
> >> +				 int fc_words, bool is_param)
> >> +{
> >> +	int i;
> >> +
> >> +	if (!is_param) {
> >> +		/*
> >> +		 * memcpy can do unaligned aligned access depending on source
> >> +		 * and dest address, which is incompatible with nand cache. Fallback
> >> +		 * to the memcpy for io version
> >> +		 */
> >> +		if (bcmbca_nand_is_buf_aligned(flash_cache, buffer))
> >> +			memcpy((void *)buffer, (void *)flash_cache, fc_words * 4);
> >> +		else
> >> +			memcpy_fromio((void *)buffer, (void *)flash_cache, fc_words * 4);
> >> +	} else {
> >> +		/* Flash cache has same endian as the host for parameter pages */
> >> +		for (i = 0; i < fc_words; i++, buffer++)
> >> +			*buffer = __raw_readl(flash_cache + i * 4);
> >> +	}
> >> +}
> >> +
> >>   static int bcmbca_nand_probe(struct platform_device *pdev)
> >>   {
> >>   	struct device *dev = &pdev->dev;
> >> @@ -75,6 +110,7 @@ static int bcmbca_nand_probe(struct platform_device *pdev)
> >>   
> >>   	soc->ctlrdy_ack = bcmbca_nand_intc_ack;
> >>   	soc->ctlrdy_set_enabled = bcmbca_nand_intc_set;
> >> +	soc->read_data_bus = bcmbca_read_data_bus;
> >>   
> >>   	return brcmnand_probe(pdev, soc);
> >>   }
> >> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> >> index d920e88c7f5b..656be4d73016 100644
> >> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> >> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> >> @@ -814,6 +814,30 @@ static inline u32 edu_readl(struct brcmnand_controller *ctrl,
> >>   	return brcmnand_readl(ctrl->edu_base + offs);
> >>   }
> >>   
> >> +static inline void brcmnand_read_data_bus(struct brcmnand_controller *ctrl,
> >> +					   void __iomem *flash_cache, u32 *buffer,
> >> +					   int fc_words, bool is_param)  
> > 
> > I strongly dislike this "is_param" boolean.
> > 
> > When is the data in host endianness? When is it not?  
> This is little bit complicated.  We have two type data read from nand cache. One for page read and the other for parameter and onfi data read from the controller side. But it depends on how SoC integrate the nand cache to system. In broadband SoC, both page and parameter data are in host endianess but other SoCs is not the same.
> 
> I am open to suggestion for is_param function argument but to factor out this common code in more structured way, I don't see other way around.

Alright, so this is SoC dependent, very well -> a (sub)compatible per
SoC + platform data associated to it with the right function.

> > If we think about an exec_op() conversion and drop cmdfunc(), what
> > would be the discriminant?
> >   
> If we need to implement exec_op in the future,  the data is not coming from nand cache but some other low level data register which may not subject to the endianess issue.

Can't you use the same cache all the time here as well then? And avoid
the need for this overly complex logic?

> 
> >> +{
> >> +	struct brcmnand_soc *soc = ctrl->soc;
> >> +	int i;
> >> +
> >> +	if (soc->read_data_bus) {
> >> +		soc->read_data_bus(soc, flash_cache, buffer, fc_words, is_param);
> >> +	} else {
> >> +		if (!is_param) {
> >> +			for (i = 0; i < fc_words; i++, buffer++)
> >> +				*buffer = brcmnand_read_fc(ctrl, i);
> >> +		} else {
> >> +			for (i = 0; i < fc_words; i++)
> >> +				/*
> >> +				 * Flash cache is big endian for parameter pages, at
> >> +				 * least on STB SoCs
> >> +				 */
> >> +				buffer[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
> >> +		}
> >> +	}
> >> +}
> >> +
> >>   static void brcmnand_clear_ecc_addr(struct brcmnand_controller *ctrl)
> >>   {  
> >>   >> @@ -1811,20 +1835,11 @@ static void brcmnand_cmdfunc(struct nand_chip *chip, unsigned command,  
> >>   			native_cmd == CMD_PARAMETER_CHANGE_COL) {
> >>   		/* Copy flash cache word-wise */
> >>   		u32 *flash_cache = (u32 *)ctrl->flash_cache;
> >> -		int i;  
> >>   >>   		brcmnand_soc_data_bus_prepare(ctrl->soc, true);
> >>   >> -		/*  
> >> -		 * Must cache the FLASH_CACHE now, since changes in
> >> -		 * SECTOR_SIZE_1K may invalidate it
> >> -		 */
> >> -		for (i = 0; i < FC_WORDS; i++)
> >> -			/*
> >> -			 * Flash cache is big endian for parameter pages, at
> >> -			 * least on STB SoCs
> >> -			 */
> >> -			flash_cache[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
> >> +		brcmnand_read_data_bus(ctrl, ctrl->nand_fc, flash_cache,
> >> +				   FC_WORDS, true);  
> >>   >>   		brcmnand_soc_data_bus_unprepare(ctrl->soc, true);
> >>   >> @@ -2137,7 +2152,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,  
> >>   {
> >>   	struct brcmnand_host *host = nand_get_controller_data(chip);
> >>   	struct brcmnand_controller *ctrl = host->ctrl;
> >> -	int i, j, ret = 0;
> >> +	int i, ret = 0;  
> >>   >>   	brcmnand_clear_ecc_addr(ctrl);
> >>   >> @@ -2150,8 +2165,9 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,  
> >>   		if (likely(buf)) {
> >>   			brcmnand_soc_data_bus_prepare(ctrl->soc, false);  
> >>   >> -			for (j = 0; j < FC_WORDS; j++, buf++)  
> >> -				*buf = brcmnand_read_fc(ctrl, j);
> >> +			brcmnand_read_data_bus(ctrl, ctrl->nand_fc, buf,
> >> +					FC_WORDS, false);
> >> +			buf += FC_WORDS;  
> >>   >>   			brcmnand_soc_data_bus_unprepare(ctrl->soc, false);  
> >>   		}
> >> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.h b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> >> index f1f93d85f50d..88819bc395f8 100644
> >> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> >> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> >> @@ -24,6 +24,8 @@ struct brcmnand_soc {
> >>   	void (*ctlrdy_set_enabled)(struct brcmnand_soc *soc, bool en);
> >>   	void (*prepare_data_bus)(struct brcmnand_soc *soc, bool prepare,
> >>   				 bool is_param);
> >> +	void (*read_data_bus)(struct brcmnand_soc *soc, void __iomem *flash_cache,
> >> +				 u32 *buffer, int fc_words, bool is_param);
> >>   	const struct brcmnand_io_ops *ops;
> >>   };  
> >>   > >   
> > Thanks,
> > Miquèl
> >   


Thanks,
Miquèl
Miquel Raynal June 8, 2023, 6:18 a.m. UTC | #6
Hi William,

william.zhang@broadcom.com wrote on Wed, 7 Jun 2023 13:24:23 -0700:

> Hi Miquel,
> 
> On 06/07/2023 01:22 AM, Miquel Raynal wrote:
> > Hi William,
> > 
> > william.zhang@broadcom.com wrote on Tue,  6 Jun 2023 16:12:50 -0700:
> >   
> >> The BCMBCA broadband SoC integrates the NAND controller differently than
> >> STB, iProc and other SoCs.  It has different endianness for NAND cache
> >> data and ONFI parameter data.
> >>
> >> Add a SoC read data bus shim for BCMBCA to meet the specific SoC need
> >> and performance improvement using the optimized memcpy function on NAND
> >> cache memory.
> >>
> >> Signed-off-by: William Zhang <william.zhang@broadcom.com>
> >> ---
> >>
> >>   drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c | 36 +++++++++++++++++
> >>   drivers/mtd/nand/raw/brcmnand/brcmnand.c    | 44 ++++++++++++++-------
> >>   drivers/mtd/nand/raw/brcmnand/brcmnand.h    |  2 +
> >>   3 files changed, 68 insertions(+), 14 deletions(-)
> >>
> >> diff --git a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> >> index 7e48b6a0bfa2..899103a62c98 100644
> >> --- a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> >> +++ b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> >> @@ -26,6 +26,18 @@ enum {
> >>   	BCMBCA_CTLRDY		= BIT(4),
> >>   };  
> >>   >> +#if defined(CONFIG_ARM64)  
> >> +#define ALIGN_REQ		8
> >> +#else
> >> +#define ALIGN_REQ		4
> >> +#endif
> >> +
> >> +static inline bool bcmbca_nand_is_buf_aligned(void *flash_cache,  void *buffer)
> >> +{
> >> +	return IS_ALIGNED((uintptr_t)buffer, ALIGN_REQ) &&
> >> +				IS_ALIGNED((uintptr_t)flash_cache, ALIGN_REQ);
> >> +}
> >> +
> >>   static bool bcmbca_nand_intc_ack(struct brcmnand_soc *soc)
> >>   {
> >>   	struct bcmbca_nand_soc *priv =
> >> @@ -56,6 +68,29 @@ static void bcmbca_nand_intc_set(struct brcmnand_soc *soc, bool en)
> >>   	brcmnand_writel(val, mmio);
> >>   }  
> >>   >> +static void bcmbca_read_data_bus(struct brcmnand_soc *soc,  
> >> +				 void __iomem *flash_cache,  u32 *buffer,
> >> +				 int fc_words, bool is_param)
> >> +{
> >> +	int i;
> >> +
> >> +	if (!is_param) {
> >> +		/*
> >> +		 * memcpy can do unaligned aligned access depending on source
> >> +		 * and dest address, which is incompatible with nand cache. Fallback
> >> +		 * to the memcpy for io version
> >> +		 */
> >> +		if (bcmbca_nand_is_buf_aligned(flash_cache, buffer))
> >> +			memcpy((void *)buffer, (void *)flash_cache, fc_words * 4);
> >> +		else
> >> +			memcpy_fromio((void *)buffer, (void *)flash_cache, fc_words * 4);
> >> +	} else {
> >> +		/* Flash cache has same endian as the host for parameter pages */
> >> +		for (i = 0; i < fc_words; i++, buffer++)
> >> +			*buffer = __raw_readl(flash_cache + i * 4);
> >> +	}
> >> +}
> >> +
> >>   static int bcmbca_nand_probe(struct platform_device *pdev)
> >>   {
> >>   	struct device *dev = &pdev->dev;
> >> @@ -75,6 +110,7 @@ static int bcmbca_nand_probe(struct platform_device *pdev)  
> >>   >>   	soc->ctlrdy_ack = bcmbca_nand_intc_ack;  
> >>   	soc->ctlrdy_set_enabled = bcmbca_nand_intc_set;
> >> +	soc->read_data_bus = bcmbca_read_data_bus;  
> >>   >>   	return brcmnand_probe(pdev, soc);  
> >>   }
> >> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> >> index d920e88c7f5b..656be4d73016 100644
> >> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> >> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> >> @@ -814,6 +814,30 @@ static inline u32 edu_readl(struct brcmnand_controller *ctrl,
> >>   	return brcmnand_readl(ctrl->edu_base + offs);
> >>   }  
> >>   >> +static inline void brcmnand_read_data_bus(struct brcmnand_controller *ctrl,  
> >> +					   void __iomem *flash_cache, u32 *buffer,
> >> +					   int fc_words, bool is_param)
> >> +{
> >> +	struct brcmnand_soc *soc = ctrl->soc;
> >> +	int i;
> >> +
> >> +	if (soc->read_data_bus) {
> >> +		soc->read_data_bus(soc, flash_cache, buffer, fc_words, is_param);
> >> +	} else {
> >> +		if (!is_param) {
> >> +			for (i = 0; i < fc_words; i++, buffer++)
> >> +				*buffer = brcmnand_read_fc(ctrl, i);
> >> +		} else {
> >> +			for (i = 0; i < fc_words; i++)
> >> +				/*
> >> +				 * Flash cache is big endian for parameter pages, at
> >> +				 * least on STB SoCs
> >> +				 */
> >> +				buffer[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
> >> +		}
> >> +	}  
> > 
> > Perhaps we could have a single function that is statically assigned at
> > probe time instead of a first helper with two conditions which calls in
> > one case another hook... This can be simplified I guess.
> >   
> Well this will need to be done at the SoC specific implementation level (bcm<xxx>_nand.c) and each SoC will need to have either general data bus read func with is_param option or data_bus_read_page, data_bus_read_param.

You told me in case we would use exec_op we could avoid the param
cache. If that's true then the whole support can be simplified.

>  Not sure how much this can be simplified... Or we have default
> implementation in brcmnand.c but then there is one condition check
> too. Page read is done at 512 bytes burst. One or two conditions
> check outside of the per 512 bytes read loop does not sounds too bad
> if performance is concern.

It is unreadable. That is my main concern.

> 
> >> +}
> >> +
> >>   static void brcmnand_clear_ecc_addr(struct brcmnand_controller *ctrl)
> >>   {  
> >>   >> @@ -1811,20 +1835,11 @@ static void brcmnand_cmdfunc(struct nand_chip *chip, unsigned command,  
> >>   			native_cmd == CMD_PARAMETER_CHANGE_COL) {
> >>   		/* Copy flash cache word-wise */
> >>   		u32 *flash_cache = (u32 *)ctrl->flash_cache;
> >> -		int i;  
> >>   >>   		brcmnand_soc_data_bus_prepare(ctrl->soc, true);
> >>   >> -		/*  
> >> -		 * Must cache the FLASH_CACHE now, since changes in
> >> -		 * SECTOR_SIZE_1K may invalidate it
> >> -		 */
> >> -		for (i = 0; i < FC_WORDS; i++)
> >> -			/*
> >> -			 * Flash cache is big endian for parameter pages, at
> >> -			 * least on STB SoCs
> >> -			 */
> >> -			flash_cache[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
> >> +		brcmnand_read_data_bus(ctrl, ctrl->nand_fc, flash_cache,
> >> +				   FC_WORDS, true);  
> >>   >>   		brcmnand_soc_data_bus_unprepare(ctrl->soc, true);
> >>   >> @@ -2137,7 +2152,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,  
> >>   {
> >>   	struct brcmnand_host *host = nand_get_controller_data(chip);
> >>   	struct brcmnand_controller *ctrl = host->ctrl;
> >> -	int i, j, ret = 0;
> >> +	int i, ret = 0;  
> >>   >>   	brcmnand_clear_ecc_addr(ctrl);
> >>   >> @@ -2150,8 +2165,9 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,  
> >>   		if (likely(buf)) {
> >>   			brcmnand_soc_data_bus_prepare(ctrl->soc, false);  
> >>   >> -			for (j = 0; j < FC_WORDS; j++, buf++)  
> >> -				*buf = brcmnand_read_fc(ctrl, j);
> >> +			brcmnand_read_data_bus(ctrl, ctrl->nand_fc, buf,
> >> +					FC_WORDS, false);
> >> +			buf += FC_WORDS;  
> >>   >>   			brcmnand_soc_data_bus_unprepare(ctrl->soc, false);  
> >>   		}
> >> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.h b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> >> index f1f93d85f50d..88819bc395f8 100644
> >> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> >> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> >> @@ -24,6 +24,8 @@ struct brcmnand_soc {
> >>   	void (*ctlrdy_set_enabled)(struct brcmnand_soc *soc, bool en);
> >>   	void (*prepare_data_bus)(struct brcmnand_soc *soc, bool prepare,
> >>   				 bool is_param);
> >> +	void (*read_data_bus)(struct brcmnand_soc *soc, void __iomem *flash_cache,
> >> +				 u32 *buffer, int fc_words, bool is_param);
> >>   	const struct brcmnand_io_ops *ops;
> >>   };  
> >>   > >   
> > Thanks,
> > Miquèl
> >   


Thanks,
Miquèl
William Zhang June 8, 2023, 7:04 p.m. UTC | #7
On 06/07/2023 11:15 PM, Miquel Raynal wrote:
> Hi William,
> 
> william.zhang@broadcom.com wrote on Wed, 7 Jun 2023 13:12:02 -0700:
> 
>> Hi Miquel,
>>
>> On 06/07/2023 01:20 AM, Miquel Raynal wrote:
>>> Hi William,
>>>
>>> william.zhang@broadcom.com wrote on Tue,  6 Jun 2023 16:12:50 -0700:
>>>    
>>>> The BCMBCA broadband SoC integrates the NAND controller differently than
>>>> STB, iProc and other SoCs.  It has different endianness for NAND cache
>>>> data and ONFI parameter data.
>>>>
>>>> Add a SoC read data bus shim for BCMBCA to meet the specific SoC need
>>>> and performance improvement using the optimized memcpy function on NAND
>>>> cache memory.
>>>>
>>>> Signed-off-by: William Zhang <william.zhang@broadcom.com>
>>>> ---
>>>>
>>>>    drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c | 36 +++++++++++++++++
>>>>    drivers/mtd/nand/raw/brcmnand/brcmnand.c    | 44 ++++++++++++++-------
>>>>    drivers/mtd/nand/raw/brcmnand/brcmnand.h    |  2 +
>>>>    3 files changed, 68 insertions(+), 14 deletions(-)
>>>>
>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>>>> index 7e48b6a0bfa2..899103a62c98 100644
>>>> --- a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>>>> +++ b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>>>> @@ -26,6 +26,18 @@ enum {
>>>>    	BCMBCA_CTLRDY		= BIT(4),
>>>>    };
>>>>    >> +#if defined(CONFIG_ARM64)
>>>> +#define ALIGN_REQ		8
>>>> +#else
>>>> +#define ALIGN_REQ		4
>>>> +#endif
>>>> +
>>>> +static inline bool bcmbca_nand_is_buf_aligned(void *flash_cache,  void *buffer)
>>>> +{
>>>> +	return IS_ALIGNED((uintptr_t)buffer, ALIGN_REQ) &&
>>>> +				IS_ALIGNED((uintptr_t)flash_cache, ALIGN_REQ);
>>>> +}
>>>> +
>>>>    static bool bcmbca_nand_intc_ack(struct brcmnand_soc *soc)
>>>>    {
>>>>    	struct bcmbca_nand_soc *priv =
>>>> @@ -56,6 +68,29 @@ static void bcmbca_nand_intc_set(struct brcmnand_soc *soc, bool en)
>>>>    	brcmnand_writel(val, mmio);
>>>>    }
>>>>    >> +static void bcmbca_read_data_bus(struct brcmnand_soc *soc,
>>>> +				 void __iomem *flash_cache,  u32 *buffer,
>>>> +				 int fc_words, bool is_param)
>>>> +{
>>>> +	int i;
>>>> +
>>>> +	if (!is_param) {
>>>> +		/*
>>>> +		 * memcpy can do unaligned aligned access depending on source
>>>> +		 * and dest address, which is incompatible with nand cache. Fallback
>>>> +		 * to the memcpy for io version
>>>> +		 */
>>>> +		if (bcmbca_nand_is_buf_aligned(flash_cache, buffer))
>>>> +			memcpy((void *)buffer, (void *)flash_cache, fc_words * 4);
>>>> +		else
>>>> +			memcpy_fromio((void *)buffer, (void *)flash_cache, fc_words * 4);
>>>> +	} else {
>>>> +		/* Flash cache has same endian as the host for parameter pages */
>>>> +		for (i = 0; i < fc_words; i++, buffer++)
>>>> +			*buffer = __raw_readl(flash_cache + i * 4);
>>>> +	}
>>>> +}
>>>> +
>>>>    static int bcmbca_nand_probe(struct platform_device *pdev)
>>>>    {
>>>>    	struct device *dev = &pdev->dev;
>>>> @@ -75,6 +110,7 @@ static int bcmbca_nand_probe(struct platform_device *pdev)
>>>>    >>   	soc->ctlrdy_ack = bcmbca_nand_intc_ack;
>>>>    	soc->ctlrdy_set_enabled = bcmbca_nand_intc_set;
>>>> +	soc->read_data_bus = bcmbca_read_data_bus;
>>>>    >>   	return brcmnand_probe(pdev, soc);
>>>>    }
>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>>>> index d920e88c7f5b..656be4d73016 100644
>>>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>>>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>>>> @@ -814,6 +814,30 @@ static inline u32 edu_readl(struct brcmnand_controller *ctrl,
>>>>    	return brcmnand_readl(ctrl->edu_base + offs);
>>>>    }
>>>>    >> +static inline void brcmnand_read_data_bus(struct brcmnand_controller *ctrl,
>>>> +					   void __iomem *flash_cache, u32 *buffer,
>>>> +					   int fc_words, bool is_param)
>>>
>>> I strongly dislike this "is_param" boolean.
>>>
>>> When is the data in host endianness? When is it not?
>> This is little bit complicated.  We have two type data read from nand cache. One for page read and the other for parameter and onfi data read from the controller side. But it depends on how SoC integrate the nand cache to system. In broadband SoC, both page and parameter data are in host endianess but other SoCs is not the same.
>>
>> I am open to suggestion for is_param function argument but to factor out this common code in more structured way, I don't see other way around.
> 
> Alright, so this is SoC dependent, very well -> a (sub)compatible per
> SoC + platform data associated to it with the right function.
> 
Right, we have a per-SoC compatible and can have a per-SoC implementation, but
I prefer to have a default implementation in brcmnand.c because
right now only the bcmbca SoC needs different handling. The other four
implementations are the same.

To make the code a little more readable and less complicated, I am
thinking of splitting brcmnand_read_data_bus into
brcmnand_read_page_data and brcmnand_read_param_data, with default
implementations in brcmnand.c that bcmbca will override. Would that be okay with you?

>>> If we think about an exec_op() conversion and drop cmdfunc(), what
>>> would be the discriminant?
>>>    
>> If we need to implement exec_op in the future,  the data is not coming from nand cache but some other low level data register which may not subject to the endianess issue.
> 
> Can't you use the same cache all the time here as well then? And avoid
> the need for this overly complex logic?
> 
Unfortunately, exec_op will not use the nand cache for parameter data reads
but some other low-level data register. This is dictated by the controller.

>>
>>>> +{
>>>> +	struct brcmnand_soc *soc = ctrl->soc;
>>>> +	int i;
>>>> +
>>>> +	if (soc->read_data_bus) {
>>>> +		soc->read_data_bus(soc, flash_cache, buffer, fc_words, is_param);
>>>> +	} else {
>>>> +		if (!is_param) {
>>>> +			for (i = 0; i < fc_words; i++, buffer++)
>>>> +				*buffer = brcmnand_read_fc(ctrl, i);
>>>> +		} else {
>>>> +			for (i = 0; i < fc_words; i++)
>>>> +				/*
>>>> +				 * Flash cache is big endian for parameter pages, at
>>>> +				 * least on STB SoCs
>>>> +				 */
>>>> +				buffer[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
>>>> +		}
>>>> +	}
>>>> +}
>>>> +
>>>>    static void brcmnand_clear_ecc_addr(struct brcmnand_controller *ctrl)
>>>>    {
>>>>    >> @@ -1811,20 +1835,11 @@ static void brcmnand_cmdfunc(struct nand_chip *chip, unsigned command,
>>>>    			native_cmd == CMD_PARAMETER_CHANGE_COL) {
>>>>    		/* Copy flash cache word-wise */
>>>>    		u32 *flash_cache = (u32 *)ctrl->flash_cache;
>>>> -		int i;
>>>>    >>   		brcmnand_soc_data_bus_prepare(ctrl->soc, true);
>>>>    >> -		/*
>>>> -		 * Must cache the FLASH_CACHE now, since changes in
>>>> -		 * SECTOR_SIZE_1K may invalidate it
>>>> -		 */
>>>> -		for (i = 0; i < FC_WORDS; i++)
>>>> -			/*
>>>> -			 * Flash cache is big endian for parameter pages, at
>>>> -			 * least on STB SoCs
>>>> -			 */
>>>> -			flash_cache[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
>>>> +		brcmnand_read_data_bus(ctrl, ctrl->nand_fc, flash_cache,
>>>> +				   FC_WORDS, true);
>>>>    >>   		brcmnand_soc_data_bus_unprepare(ctrl->soc, true);
>>>>    >> @@ -2137,7 +2152,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>>>>    {
>>>>    	struct brcmnand_host *host = nand_get_controller_data(chip);
>>>>    	struct brcmnand_controller *ctrl = host->ctrl;
>>>> -	int i, j, ret = 0;
>>>> +	int i, ret = 0;
>>>>    >>   	brcmnand_clear_ecc_addr(ctrl);
>>>>    >> @@ -2150,8 +2165,9 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>>>>    		if (likely(buf)) {
>>>>    			brcmnand_soc_data_bus_prepare(ctrl->soc, false);
>>>>    >> -			for (j = 0; j < FC_WORDS; j++, buf++)
>>>> -				*buf = brcmnand_read_fc(ctrl, j);
>>>> +			brcmnand_read_data_bus(ctrl, ctrl->nand_fc, buf,
>>>> +					FC_WORDS, false);
>>>> +			buf += FC_WORDS;
>>>>    >>   			brcmnand_soc_data_bus_unprepare(ctrl->soc, false);
>>>>    		}
>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.h b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>>>> index f1f93d85f50d..88819bc395f8 100644
>>>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>>>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>>>> @@ -24,6 +24,8 @@ struct brcmnand_soc {
>>>>    	void (*ctlrdy_set_enabled)(struct brcmnand_soc *soc, bool en);
>>>>    	void (*prepare_data_bus)(struct brcmnand_soc *soc, bool prepare,
>>>>    				 bool is_param);
>>>> +	void (*read_data_bus)(struct brcmnand_soc *soc, void __iomem *flash_cache,
>>>> +				 u32 *buffer, int fc_words, bool is_param);
>>>>    	const struct brcmnand_io_ops *ops;
>>>>    };
>>>>    > >
>>> Thanks,
>>> Miquèl
>>>    
> 
> 
> Thanks,
> Miquèl
>
William Zhang June 8, 2023, 7:10 p.m. UTC | #8
On 06/07/2023 11:18 PM, Miquel Raynal wrote:
> Hi William,
> 
> william.zhang@broadcom.com wrote on Wed, 7 Jun 2023 13:24:23 -0700:
> 
>> Hi Miquel,
>>
>> On 06/07/2023 01:22 AM, Miquel Raynal wrote:
>>> Hi William,
>>>
>>> william.zhang@broadcom.com wrote on Tue,  6 Jun 2023 16:12:50 -0700:
>>>    
>>>> The BCMBCA broadband SoC integrates the NAND controller differently than
>>>> STB, iProc and other SoCs.  It has different endianness for NAND cache
>>>> data and ONFI parameter data.
>>>>
>>>> Add a SoC read data bus shim for BCMBCA to meet the specific SoC need
>>>> and performance improvement using the optimized memcpy function on NAND
>>>> cache memory.
>>>>
>>>> Signed-off-by: William Zhang <william.zhang@broadcom.com>
>>>> ---
>>>>
>>>>    drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c | 36 +++++++++++++++++
>>>>    drivers/mtd/nand/raw/brcmnand/brcmnand.c    | 44 ++++++++++++++-------
>>>>    drivers/mtd/nand/raw/brcmnand/brcmnand.h    |  2 +
>>>>    3 files changed, 68 insertions(+), 14 deletions(-)
>>>>
>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>>>> index 7e48b6a0bfa2..899103a62c98 100644
>>>> --- a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>>>> +++ b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>>>> @@ -26,6 +26,18 @@ enum {
>>>>    	BCMBCA_CTLRDY		= BIT(4),
>>>>    };
>>>>    >> +#if defined(CONFIG_ARM64)
>>>> +#define ALIGN_REQ		8
>>>> +#else
>>>> +#define ALIGN_REQ		4
>>>> +#endif
>>>> +
>>>> +static inline bool bcmbca_nand_is_buf_aligned(void *flash_cache,  void *buffer)
>>>> +{
>>>> +	return IS_ALIGNED((uintptr_t)buffer, ALIGN_REQ) &&
>>>> +				IS_ALIGNED((uintptr_t)flash_cache, ALIGN_REQ);
>>>> +}
>>>> +
>>>>    static bool bcmbca_nand_intc_ack(struct brcmnand_soc *soc)
>>>>    {
>>>>    	struct bcmbca_nand_soc *priv =
>>>> @@ -56,6 +68,29 @@ static void bcmbca_nand_intc_set(struct brcmnand_soc *soc, bool en)
>>>>    	brcmnand_writel(val, mmio);
>>>>    }
>>>>    >> +static void bcmbca_read_data_bus(struct brcmnand_soc *soc,
>>>> +				 void __iomem *flash_cache,  u32 *buffer,
>>>> +				 int fc_words, bool is_param)
>>>> +{
>>>> +	int i;
>>>> +
>>>> +	if (!is_param) {
>>>> +		/*
>>>> +		 * memcpy can do unaligned aligned access depending on source
>>>> +		 * and dest address, which is incompatible with nand cache. Fallback
>>>> +		 * to the memcpy for io version
>>>> +		 */
>>>> +		if (bcmbca_nand_is_buf_aligned(flash_cache, buffer))
>>>> +			memcpy((void *)buffer, (void *)flash_cache, fc_words * 4);
>>>> +		else
>>>> +			memcpy_fromio((void *)buffer, (void *)flash_cache, fc_words * 4);
>>>> +	} else {
>>>> +		/* Flash cache has same endian as the host for parameter pages */
>>>> +		for (i = 0; i < fc_words; i++, buffer++)
>>>> +			*buffer = __raw_readl(flash_cache + i * 4);
>>>> +	}
>>>> +}
>>>> +
>>>>    static int bcmbca_nand_probe(struct platform_device *pdev)
>>>>    {
>>>>    	struct device *dev = &pdev->dev;
>>>> @@ -75,6 +110,7 @@ static int bcmbca_nand_probe(struct platform_device *pdev)
>>>>    >>   	soc->ctlrdy_ack = bcmbca_nand_intc_ack;
>>>>    	soc->ctlrdy_set_enabled = bcmbca_nand_intc_set;
>>>> +	soc->read_data_bus = bcmbca_read_data_bus;
>>>>    >>   	return brcmnand_probe(pdev, soc);
>>>>    }
>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>>>> index d920e88c7f5b..656be4d73016 100644
>>>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>>>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>>>> @@ -814,6 +814,30 @@ static inline u32 edu_readl(struct brcmnand_controller *ctrl,
>>>>    	return brcmnand_readl(ctrl->edu_base + offs);
>>>>    }
>>>>    >> +static inline void brcmnand_read_data_bus(struct brcmnand_controller *ctrl,
>>>> +					   void __iomem *flash_cache, u32 *buffer,
>>>> +					   int fc_words, bool is_param)
>>>> +{
>>>> +	struct brcmnand_soc *soc = ctrl->soc;
>>>> +	int i;
>>>> +
>>>> +	if (soc->read_data_bus) {
>>>> +		soc->read_data_bus(soc, flash_cache, buffer, fc_words, is_param);
>>>> +	} else {
>>>> +		if (!is_param) {
>>>> +			for (i = 0; i < fc_words; i++, buffer++)
>>>> +				*buffer = brcmnand_read_fc(ctrl, i);
>>>> +		} else {
>>>> +			for (i = 0; i < fc_words; i++)
>>>> +				/*
>>>> +				 * Flash cache is big endian for parameter pages, at
>>>> +				 * least on STB SoCs
>>>> +				 */
>>>> +				buffer[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
>>>> +		}
>>>> +	}
>>>
>>> Perhaps we could have a single function that is statically assigned at
>>> probe time instead of a first helper with two conditions which calls in
>>> one case another hook... This can be simplified I guess.
>>>    
>> Well this will need to be done at the SoC specific implementation level (bcm<xxx>_nand.c) and each SoC will need to have either general data bus read func with is_param option or data_bus_read_page, data_bus_read_param.
> 
> You told me in case we would use exec_op we could avoid the param
> cache. If that's true then the whole support can be simplified.
> 
Correct, we may be able to unify the parameter data read, but exec_op is a
long shot and we are not fully ready for that yet. It also depends on whether
the low-level data register has an endianness difference for the parameter
data between different SoCs.

So I would like to push the current implementation, and we can explore
the exec_op option later, which will be a much bigger and completely different
implementation.

>>   Not sure how much this can be simplified... Or we have default
>> implementation in brcmnand.c but then there is one condition check
>> too. Page read is done at 512 bytes burst. One or two conditions
>> check outside of the per 512 bytes read loop does not sounds too bad
>> if performance is concern.
> 
> It is unreadable. That is my main concern.
> 
>>
>>>> +}
>>>> +
>>>>    static void brcmnand_clear_ecc_addr(struct brcmnand_controller *ctrl)
>>>>    {
>>>>    >> @@ -1811,20 +1835,11 @@ static void brcmnand_cmdfunc(struct nand_chip *chip, unsigned command,
>>>>    			native_cmd == CMD_PARAMETER_CHANGE_COL) {
>>>>    		/* Copy flash cache word-wise */
>>>>    		u32 *flash_cache = (u32 *)ctrl->flash_cache;
>>>> -		int i;
>>>>    >>   		brcmnand_soc_data_bus_prepare(ctrl->soc, true);
>>>>    >> -		/*
>>>> -		 * Must cache the FLASH_CACHE now, since changes in
>>>> -		 * SECTOR_SIZE_1K may invalidate it
>>>> -		 */
>>>> -		for (i = 0; i < FC_WORDS; i++)
>>>> -			/*
>>>> -			 * Flash cache is big endian for parameter pages, at
>>>> -			 * least on STB SoCs
>>>> -			 */
>>>> -			flash_cache[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
>>>> +		brcmnand_read_data_bus(ctrl, ctrl->nand_fc, flash_cache,
>>>> +				   FC_WORDS, true);
>>>>    >>   		brcmnand_soc_data_bus_unprepare(ctrl->soc, true);
>>>>    >> @@ -2137,7 +2152,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>>>>    {
>>>>    	struct brcmnand_host *host = nand_get_controller_data(chip);
>>>>    	struct brcmnand_controller *ctrl = host->ctrl;
>>>> -	int i, j, ret = 0;
>>>> +	int i, ret = 0;
>>>>    >>   	brcmnand_clear_ecc_addr(ctrl);
>>>>    >> @@ -2150,8 +2165,9 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>>>>    		if (likely(buf)) {
>>>>    			brcmnand_soc_data_bus_prepare(ctrl->soc, false);
>>>>    >> -			for (j = 0; j < FC_WORDS; j++, buf++)
>>>> -				*buf = brcmnand_read_fc(ctrl, j);
>>>> +			brcmnand_read_data_bus(ctrl, ctrl->nand_fc, buf,
>>>> +					FC_WORDS, false);
>>>> +			buf += FC_WORDS;
>>>>    >>   			brcmnand_soc_data_bus_unprepare(ctrl->soc, false);
>>>>    		}
>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.h b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>>>> index f1f93d85f50d..88819bc395f8 100644
>>>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>>>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>>>> @@ -24,6 +24,8 @@ struct brcmnand_soc {
>>>>    	void (*ctlrdy_set_enabled)(struct brcmnand_soc *soc, bool en);
>>>>    	void (*prepare_data_bus)(struct brcmnand_soc *soc, bool prepare,
>>>>    				 bool is_param);
>>>> +	void (*read_data_bus)(struct brcmnand_soc *soc, void __iomem *flash_cache,
>>>> +				 u32 *buffer, int fc_words, bool is_param);
>>>>    	const struct brcmnand_io_ops *ops;
>>>>    };
>>>>    > >
>>> Thanks,
>>> Miquèl
>>>    
> 
> 
> Thanks,
> Miquèl
>
Miquel Raynal June 9, 2023, 8:35 a.m. UTC | #9
Hi William,

william.zhang@broadcom.com wrote on Thu, 8 Jun 2023 12:10:06 -0700:

> On 06/07/2023 11:18 PM, Miquel Raynal wrote:
> > Hi William,
> > 
> > william.zhang@broadcom.com wrote on Wed, 7 Jun 2023 13:24:23 -0700:
> >   
> >> Hi Miquel,
> >>
> >> On 06/07/2023 01:22 AM, Miquel Raynal wrote:  
> >>> Hi William,
> >>>
> >>> william.zhang@broadcom.com wrote on Tue,  6 Jun 2023 16:12:50 -0700:  
> >>>    >>>> The BCMBCA broadband SoC integrates the NAND controller differently than  
> >>>> STB, iProc and other SoCs.  It has different endianness for NAND cache
> >>>> data and ONFI parameter data.
> >>>>
> >>>> Add a SoC read data bus shim for BCMBCA to meet the specific SoC need
> >>>> and performance improvement using the optimized memcpy function on NAND
> >>>> cache memory.
> >>>>
> >>>> Signed-off-by: William Zhang <william.zhang@broadcom.com>
> >>>> ---
> >>>>
> >>>>    drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c | 36 +++++++++++++++++
> >>>>    drivers/mtd/nand/raw/brcmnand/brcmnand.c    | 44 ++++++++++++++-------
> >>>>    drivers/mtd/nand/raw/brcmnand/brcmnand.h    |  2 +
> >>>>    3 files changed, 68 insertions(+), 14 deletions(-)
> >>>>
> >>>> diff --git a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> >>>> index 7e48b6a0bfa2..899103a62c98 100644
> >>>> --- a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> >>>> +++ b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> >>>> @@ -26,6 +26,18 @@ enum {
> >>>>    	BCMBCA_CTLRDY		= BIT(4),
> >>>>    };  
> >>>>    >> +#if defined(CONFIG_ARM64)  
> >>>> +#define ALIGN_REQ		8
> >>>> +#else
> >>>> +#define ALIGN_REQ		4
> >>>> +#endif
> >>>> +
> >>>> +static inline bool bcmbca_nand_is_buf_aligned(void *flash_cache,  void *buffer)
> >>>> +{
> >>>> +	return IS_ALIGNED((uintptr_t)buffer, ALIGN_REQ) &&
> >>>> +				IS_ALIGNED((uintptr_t)flash_cache, ALIGN_REQ);
> >>>> +}
> >>>> +
> >>>>    static bool bcmbca_nand_intc_ack(struct brcmnand_soc *soc)
> >>>>    {
> >>>>    	struct bcmbca_nand_soc *priv =
> >>>> @@ -56,6 +68,29 @@ static void bcmbca_nand_intc_set(struct brcmnand_soc *soc, bool en)
> >>>>    	brcmnand_writel(val, mmio);
> >>>>    }  
> >>>>    >> +static void bcmbca_read_data_bus(struct brcmnand_soc *soc,  
> >>>> +				 void __iomem *flash_cache,  u32 *buffer,
> >>>> +				 int fc_words, bool is_param)
> >>>> +{
> >>>> +	int i;
> >>>> +
> >>>> +	if (!is_param) {
> >>>> +		/*
> >>>> +		 * memcpy can do unaligned aligned access depending on source
> >>>> +		 * and dest address, which is incompatible with nand cache. Fallback
> >>>> +		 * to the memcpy for io version
> >>>> +		 */
> >>>> +		if (bcmbca_nand_is_buf_aligned(flash_cache, buffer))
> >>>> +			memcpy((void *)buffer, (void *)flash_cache, fc_words * 4);
> >>>> +		else
> >>>> +			memcpy_fromio((void *)buffer, (void *)flash_cache, fc_words * 4);
> >>>> +	} else {
> >>>> +		/* Flash cache has same endian as the host for parameter pages */
> >>>> +		for (i = 0; i < fc_words; i++, buffer++)
> >>>> +			*buffer = __raw_readl(flash_cache + i * 4);
> >>>> +	}
> >>>> +}
> >>>> +
> >>>>    static int bcmbca_nand_probe(struct platform_device *pdev)
> >>>>    {
> >>>>    	struct device *dev = &pdev->dev;
> >>>> @@ -75,6 +110,7 @@ static int bcmbca_nand_probe(struct platform_device *pdev)  
> >>>>    >>   	soc->ctlrdy_ack = bcmbca_nand_intc_ack;  
> >>>>    	soc->ctlrdy_set_enabled = bcmbca_nand_intc_set;
> >>>> +	soc->read_data_bus = bcmbca_read_data_bus;  
> >>>>    >>   	return brcmnand_probe(pdev, soc);  
> >>>>    }
> >>>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> >>>> index d920e88c7f5b..656be4d73016 100644
> >>>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> >>>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> >>>> @@ -814,6 +814,30 @@ static inline u32 edu_readl(struct brcmnand_controller *ctrl,
> >>>>    	return brcmnand_readl(ctrl->edu_base + offs);
> >>>>    }  
> >>>>    >> +static inline void brcmnand_read_data_bus(struct brcmnand_controller *ctrl,  
> >>>> +					   void __iomem *flash_cache, u32 *buffer,
> >>>> +					   int fc_words, bool is_param)
> >>>> +{
> >>>> +	struct brcmnand_soc *soc = ctrl->soc;
> >>>> +	int i;
> >>>> +
> >>>> +	if (soc->read_data_bus) {
> >>>> +		soc->read_data_bus(soc, flash_cache, buffer, fc_words, is_param);
> >>>> +	} else {
> >>>> +		if (!is_param) {
> >>>> +			for (i = 0; i < fc_words; i++, buffer++)
> >>>> +				*buffer = brcmnand_read_fc(ctrl, i);
> >>>> +		} else {
> >>>> +			for (i = 0; i < fc_words; i++)
> >>>> +				/*
> >>>> +				 * Flash cache is big endian for parameter pages, at
> >>>> +				 * least on STB SoCs
> >>>> +				 */
> >>>> +				buffer[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
> >>>> +		}
> >>>> +	}  
> >>>
> >>> Perhaps we could have a single function that is statically assigned at
> >>> probe time instead of a first helper with two conditions which calls in
> >>> one case another hook... This can be simplified I guess.  
> >>>    >> Well this will need to be done at the SoC specific implementation level (bcm<xxx>_nand.c) and each SoC will need to have either general data bus read func with is_param option or data_bus_read_page, data_bus_read_param.  
> > 
> > You told me in case we would use exec_op we could avoid the param
> > cache. If that's true then the whole support can be simplified.
> >   
> Correct we may possibly unified the parameter data read but exec_op is long shot and we are not fully ready for that yet. It also depends on if the low level data register has endianess difference for the parameter data between difference SoCs.
> 
> So I would like to push the current implementation and we can explore the exec_op option late which will be a much big and complete different implementation.

I am sorry but this series is totally backwards: you're trying to guess
what comes next with the 'is_param' thing, which is exactly what we have
been fighting against since 2017. There are plenty of ->exec_op()
conversions out there; I don't believe this one will be harder. You
need to convert the driver to this new API and get rid of this whole
endianness nonsense to simplify the driver a lot.

> 
> >>   Not sure how much this can be simplified... Or we have default
> >> implementation in brcmnand.c but then there is one condition check
> >> too. Page read is done at 512 bytes burst. One or two conditions
> >> check outside of the per 512 bytes read loop does not sounds too bad
> >> if performance is concern.  
> > 
> > It is unreadable. That is my main concern.
> >   
> >>  
> >>>> +}
> >>>> +
> >>>>    static void brcmnand_clear_ecc_addr(struct brcmnand_controller *ctrl)
> >>>>    {  
> >>>>    >> @@ -1811,20 +1835,11 @@ static void brcmnand_cmdfunc(struct nand_chip *chip, unsigned command,  
> >>>>    			native_cmd == CMD_PARAMETER_CHANGE_COL) {
> >>>>    		/* Copy flash cache word-wise */
> >>>>    		u32 *flash_cache = (u32 *)ctrl->flash_cache;
> >>>> -		int i;  
> >>>>    >>   		brcmnand_soc_data_bus_prepare(ctrl->soc, true);
> >>>>    >> -		/*  
> >>>> -		 * Must cache the FLASH_CACHE now, since changes in
> >>>> -		 * SECTOR_SIZE_1K may invalidate it
> >>>> -		 */
> >>>> -		for (i = 0; i < FC_WORDS; i++)
> >>>> -			/*
> >>>> -			 * Flash cache is big endian for parameter pages, at
> >>>> -			 * least on STB SoCs
> >>>> -			 */
> >>>> -			flash_cache[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
> >>>> +		brcmnand_read_data_bus(ctrl, ctrl->nand_fc, flash_cache,
> >>>> +				   FC_WORDS, true);  
> >>>>    >>   		brcmnand_soc_data_bus_unprepare(ctrl->soc, true);
> >>>>    >> @@ -2137,7 +2152,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,  
> >>>>    {
> >>>>    	struct brcmnand_host *host = nand_get_controller_data(chip);
> >>>>    	struct brcmnand_controller *ctrl = host->ctrl;
> >>>> -	int i, j, ret = 0;
> >>>> +	int i, ret = 0;  
> >>>>    >>   	brcmnand_clear_ecc_addr(ctrl);
> >>>>    >> @@ -2150,8 +2165,9 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,  
> >>>>    		if (likely(buf)) {
> >>>>    			brcmnand_soc_data_bus_prepare(ctrl->soc, false);  
> >>>>    >> -			for (j = 0; j < FC_WORDS; j++, buf++)  
> >>>> -				*buf = brcmnand_read_fc(ctrl, j);
> >>>> +			brcmnand_read_data_bus(ctrl, ctrl->nand_fc, buf,
> >>>> +					FC_WORDS, false);
> >>>> +			buf += FC_WORDS;  
> >>>>    >>   			brcmnand_soc_data_bus_unprepare(ctrl->soc, false);  
> >>>>    		}
> >>>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.h b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> >>>> index f1f93d85f50d..88819bc395f8 100644
> >>>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> >>>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> >>>> @@ -24,6 +24,8 @@ struct brcmnand_soc {
> >>>>    	void (*ctlrdy_set_enabled)(struct brcmnand_soc *soc, bool en);
> >>>>    	void (*prepare_data_bus)(struct brcmnand_soc *soc, bool prepare,
> >>>>    				 bool is_param);
> >>>> +	void (*read_data_bus)(struct brcmnand_soc *soc, void __iomem *flash_cache,
> >>>> +				 u32 *buffer, int fc_words, bool is_param);
> >>>>    	const struct brcmnand_io_ops *ops;
> >>>>    };  
> >>>>    > >  
> >>> Thanks,
> >>> Miquèl  
> >>>    > >   
> > Thanks,
> > Miquèl
> >   


Thanks,
Miquèl
William Zhang June 9, 2023, 7:16 p.m. UTC | #10
Hi Miquel,

On 06/09/2023 01:35 AM, Miquel Raynal wrote:
> Hi William,
> 
> william.zhang@broadcom.com wrote on Thu, 8 Jun 2023 12:10:06 -0700:
> 
>> On 06/07/2023 11:18 PM, Miquel Raynal wrote:
>>> Hi William,
>>>
>>> william.zhang@broadcom.com wrote on Wed, 7 Jun 2023 13:24:23 -0700:
>>>    
>>>> Hi Miquel,
>>>>
>>>> On 06/07/2023 01:22 AM, Miquel Raynal wrote:
>>>>> Hi William,
>>>>>
>>>>> william.zhang@broadcom.com wrote on Tue,  6 Jun 2023 16:12:50 -0700:
>>>>>     >>>> The BCMBCA broadband SoC integrates the NAND controller differently than
>>>>>> STB, iProc and other SoCs.  It has different endianness for NAND cache
>>>>>> data and ONFI parameter data.
>>>>>>
>>>>>> Add a SoC read data bus shim for BCMBCA to meet the specific SoC need
>>>>>> and performance improvement using the optimized memcpy function on NAND
>>>>>> cache memory.
>>>>>>
>>>>>> Signed-off-by: William Zhang <william.zhang@broadcom.com>
>>>>>> ---
>>>>>>
>>>>>>     drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c | 36 +++++++++++++++++
>>>>>>     drivers/mtd/nand/raw/brcmnand/brcmnand.c    | 44 ++++++++++++++-------
>>>>>>     drivers/mtd/nand/raw/brcmnand/brcmnand.h    |  2 +
>>>>>>     3 files changed, 68 insertions(+), 14 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>>>>>> index 7e48b6a0bfa2..899103a62c98 100644
>>>>>> --- a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>>>>>> +++ b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>>>>>> @@ -26,6 +26,18 @@ enum {
>>>>>>     	BCMBCA_CTLRDY		= BIT(4),
>>>>>>     };
>>>>>>     >> +#if defined(CONFIG_ARM64)
>>>>>> +#define ALIGN_REQ		8
>>>>>> +#else
>>>>>> +#define ALIGN_REQ		4
>>>>>> +#endif
>>>>>> +
>>>>>> +static inline bool bcmbca_nand_is_buf_aligned(void *flash_cache,  void *buffer)
>>>>>> +{
>>>>>> +	return IS_ALIGNED((uintptr_t)buffer, ALIGN_REQ) &&
>>>>>> +				IS_ALIGNED((uintptr_t)flash_cache, ALIGN_REQ);
>>>>>> +}
>>>>>> +
>>>>>>     static bool bcmbca_nand_intc_ack(struct brcmnand_soc *soc)
>>>>>>     {
>>>>>>     	struct bcmbca_nand_soc *priv =
>>>>>> @@ -56,6 +68,29 @@ static void bcmbca_nand_intc_set(struct brcmnand_soc *soc, bool en)
>>>>>>     	brcmnand_writel(val, mmio);
>>>>>>     }
>>>>>>     >> +static void bcmbca_read_data_bus(struct brcmnand_soc *soc,
>>>>>> +				 void __iomem *flash_cache,  u32 *buffer,
>>>>>> +				 int fc_words, bool is_param)
>>>>>> +{
>>>>>> +	int i;
>>>>>> +
>>>>>> +	if (!is_param) {
>>>>>> +		/*
>>>>>> +		 * memcpy can do unaligned aligned access depending on source
>>>>>> +		 * and dest address, which is incompatible with nand cache. Fallback
>>>>>> +		 * to the memcpy for io version
>>>>>> +		 */
>>>>>> +		if (bcmbca_nand_is_buf_aligned(flash_cache, buffer))
>>>>>> +			memcpy((void *)buffer, (void *)flash_cache, fc_words * 4);
>>>>>> +		else
>>>>>> +			memcpy_fromio((void *)buffer, (void *)flash_cache, fc_words * 4);
>>>>>> +	} else {
>>>>>> +		/* Flash cache has same endian as the host for parameter pages */
>>>>>> +		for (i = 0; i < fc_words; i++, buffer++)
>>>>>> +			*buffer = __raw_readl(flash_cache + i * 4);
>>>>>> +	}
>>>>>> +}
>>>>>> +
>>>>>>     static int bcmbca_nand_probe(struct platform_device *pdev)
>>>>>>     {
>>>>>>     	struct device *dev = &pdev->dev;
>>>>>> @@ -75,6 +110,7 @@ static int bcmbca_nand_probe(struct platform_device *pdev)
>>>>>>     >>   	soc->ctlrdy_ack = bcmbca_nand_intc_ack;
>>>>>>     	soc->ctlrdy_set_enabled = bcmbca_nand_intc_set;
>>>>>> +	soc->read_data_bus = bcmbca_read_data_bus;
>>>>>>     >>   	return brcmnand_probe(pdev, soc);
>>>>>>     }
>>>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>>>>>> index d920e88c7f5b..656be4d73016 100644
>>>>>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>>>>>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>>>>>> @@ -814,6 +814,30 @@ static inline u32 edu_readl(struct brcmnand_controller *ctrl,
>>>>>>     	return brcmnand_readl(ctrl->edu_base + offs);
>>>>>>     }
>>>>>>     >> +static inline void brcmnand_read_data_bus(struct brcmnand_controller *ctrl,
>>>>>> +					   void __iomem *flash_cache, u32 *buffer,
>>>>>> +					   int fc_words, bool is_param)
>>>>>> +{
>>>>>> +	struct brcmnand_soc *soc = ctrl->soc;
>>>>>> +	int i;
>>>>>> +
>>>>>> +	if (soc->read_data_bus) {
>>>>>> +		soc->read_data_bus(soc, flash_cache, buffer, fc_words, is_param);
>>>>>> +	} else {
>>>>>> +		if (!is_param) {
>>>>>> +			for (i = 0; i < fc_words; i++, buffer++)
>>>>>> +				*buffer = brcmnand_read_fc(ctrl, i);
>>>>>> +		} else {
>>>>>> +			for (i = 0; i < fc_words; i++)
>>>>>> +				/*
>>>>>> +				 * Flash cache is big endian for parameter pages, at
>>>>>> +				 * least on STB SoCs
>>>>>> +				 */
>>>>>> +				buffer[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
>>>>>> +		}
>>>>>> +	}
>>>>>
>>>>> Perhaps we could have a single function that is statically assigned at
>>>>> probe time instead of a first helper with two conditions which calls in
>>>>> one case another hook... This can be simplified I guess.
>>>>>     >> Well this will need to be done at the SoC specific implementation level (bcm<xxx>_nand.c) and each SoC will need to have either general data bus read func with is_param option or data_bus_read_page, data_bus_read_param.
>>>
>>> You told me in case we would use exec_op we could avoid the param
>>> cache. If that's true then the whole support can be simplified.
>>>    
>> Correct we may possibly unified the parameter data read but exec_op is long shot and we are not fully ready for that yet. It also depends on if the low level data register has endianess difference for the parameter data between difference SoCs.
>>
>> So I would like to push the current implementation and we can explore the exec_op option late which will be a much big and complete different implementation.
> 
> I am sorry but this series is totally backwards, you're trying to guess
> what comes next with the 'is_param' thing, it's exactly what we are
> fighting against since 2017. There are plenty of ->exec_op()
> conversions out there, I don't believe this one will be harder. You
> need to convert the driver to this new API and get rid of this whole
> endianness non-sense to simplify a lot the driver.
> 
I am not guessing anything; I am just factoring out the existing common NAND
cache read logic into a single default function (or one for page read
and another for parameter read, as I mentioned in another thread) and
allowing the SoC to override the implementation when needed.

I agree ->exec_op can possibly get rid of the parameter page read
function and is the way to go. But it won't help with the endianness of
the page read. It's not that I am against exec_op, but I want to take one
step at a time: I'd like to get these fixes and the support for the BCMBCA
SoC in first, and then work on the exec_op API, to minimize the change and
reduce the risk.

>>
>>>>    Not sure how much this can be simplified... Or we have default
>>>> implementation in brcmnand.c but then there is one condition check
>>>> too. Page read is done at 512 bytes burst. One or two conditions
>>>> check outside of the per 512 bytes read loop does not sounds too bad
>>>> if performance is concern.
>>>
>>> It is unreadable. That is my main concern.
>>>    
>>>>   
>>>>>> +}
>>>>>> +
>>>>>>     static void brcmnand_clear_ecc_addr(struct brcmnand_controller *ctrl)
>>>>>>     {
>>>>>>     >> @@ -1811,20 +1835,11 @@ static void brcmnand_cmdfunc(struct nand_chip *chip, unsigned command,
>>>>>>     			native_cmd == CMD_PARAMETER_CHANGE_COL) {
>>>>>>     		/* Copy flash cache word-wise */
>>>>>>     		u32 *flash_cache = (u32 *)ctrl->flash_cache;
>>>>>> -		int i;
>>>>>>     >>   		brcmnand_soc_data_bus_prepare(ctrl->soc, true);
>>>>>>     >> -		/*
>>>>>> -		 * Must cache the FLASH_CACHE now, since changes in
>>>>>> -		 * SECTOR_SIZE_1K may invalidate it
>>>>>> -		 */
>>>>>> -		for (i = 0; i < FC_WORDS; i++)
>>>>>> -			/*
>>>>>> -			 * Flash cache is big endian for parameter pages, at
>>>>>> -			 * least on STB SoCs
>>>>>> -			 */
>>>>>> -			flash_cache[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
>>>>>> +		brcmnand_read_data_bus(ctrl, ctrl->nand_fc, flash_cache,
>>>>>> +				   FC_WORDS, true);
>>>>>>     >>   		brcmnand_soc_data_bus_unprepare(ctrl->soc, true);
>>>>>>     >> @@ -2137,7 +2152,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>>>>>>     {
>>>>>>     	struct brcmnand_host *host = nand_get_controller_data(chip);
>>>>>>     	struct brcmnand_controller *ctrl = host->ctrl;
>>>>>> -	int i, j, ret = 0;
>>>>>> +	int i, ret = 0;
>>>>>>     >>   	brcmnand_clear_ecc_addr(ctrl);
>>>>>>     >> @@ -2150,8 +2165,9 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>>>>>>     		if (likely(buf)) {
>>>>>>     			brcmnand_soc_data_bus_prepare(ctrl->soc, false);
>>>>>>     >> -			for (j = 0; j < FC_WORDS; j++, buf++)
>>>>>> -				*buf = brcmnand_read_fc(ctrl, j);
>>>>>> +			brcmnand_read_data_bus(ctrl, ctrl->nand_fc, buf,
>>>>>> +					FC_WORDS, false);
>>>>>> +			buf += FC_WORDS;
>>>>>>     >>   			brcmnand_soc_data_bus_unprepare(ctrl->soc, false);
>>>>>>     		}
>>>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.h b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>>>>>> index f1f93d85f50d..88819bc395f8 100644
>>>>>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>>>>>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>>>>>> @@ -24,6 +24,8 @@ struct brcmnand_soc {
>>>>>>     	void (*ctlrdy_set_enabled)(struct brcmnand_soc *soc, bool en);
>>>>>>     	void (*prepare_data_bus)(struct brcmnand_soc *soc, bool prepare,
>>>>>>     				 bool is_param);
>>>>>> +	void (*read_data_bus)(struct brcmnand_soc *soc, void __iomem *flash_cache,
>>>>>> +				 u32 *buffer, int fc_words, bool is_param);
>>>>>>     	const struct brcmnand_io_ops *ops;
>>>>>>     };
>>>>>>     > >
>>>>> Thanks,
>>>>> Miquèl
>>>>>     > >
>>> Thanks,
>>> Miquèl
>>>    
> 
> 
> Thanks,
> Miquèl
>
Miquel Raynal June 12, 2023, 5:49 p.m. UTC | #11
Hi William,

william.zhang@broadcom.com wrote on Fri, 9 Jun 2023 12:16:27 -0700:

> Hi Miquel,
> 
> On 06/09/2023 01:35 AM, Miquel Raynal wrote:
> > Hi William,
> > 
> > william.zhang@broadcom.com wrote on Thu, 8 Jun 2023 12:10:06 -0700:
> >   
> >> On 06/07/2023 11:18 PM, Miquel Raynal wrote:  
> >>> Hi William,
> >>>
> >>> william.zhang@broadcom.com wrote on Wed, 7 Jun 2023 13:24:23 -0700:  
> >>>    >>>> Hi Miquel,  
> >>>>
> >>>> On 06/07/2023 01:22 AM, Miquel Raynal wrote:  
> >>>>> Hi William,
> >>>>>
> >>>>> william.zhang@broadcom.com wrote on Tue,  6 Jun 2023 16:12:50 -0700:  
> >>>>>     >>>> The BCMBCA broadband SoC integrates the NAND controller differently than  
> >>>>>> STB, iProc and other SoCs.  It has different endianness for NAND cache
> >>>>>> data and ONFI parameter data.
> >>>>>>
> >>>>>> Add a SoC read data bus shim for BCMBCA to meet the specific SoC need
> >>>>>> and performance improvement using the optimized memcpy function on NAND
> >>>>>> cache memory.
> >>>>>>
> >>>>>> Signed-off-by: William Zhang <william.zhang@broadcom.com>
> >>>>>> ---
> >>>>>>
> >>>>>>     drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c | 36 +++++++++++++++++
> >>>>>>     drivers/mtd/nand/raw/brcmnand/brcmnand.c    | 44 ++++++++++++++-------
> >>>>>>     drivers/mtd/nand/raw/brcmnand/brcmnand.h    |  2 +
> >>>>>>     3 files changed, 68 insertions(+), 14 deletions(-)
> >>>>>>
> >>>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> >>>>>> index 7e48b6a0bfa2..899103a62c98 100644
> >>>>>> --- a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> >>>>>> +++ b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
> >>>>>> @@ -26,6 +26,18 @@ enum {
> >>>>>>     	BCMBCA_CTLRDY		= BIT(4),
> >>>>>>     };  
> >>>>>>     >> +#if defined(CONFIG_ARM64)  
> >>>>>> +#define ALIGN_REQ		8
> >>>>>> +#else
> >>>>>> +#define ALIGN_REQ		4
> >>>>>> +#endif
> >>>>>> +
> >>>>>> +static inline bool bcmbca_nand_is_buf_aligned(void *flash_cache,  void *buffer)
> >>>>>> +{
> >>>>>> +	return IS_ALIGNED((uintptr_t)buffer, ALIGN_REQ) &&
> >>>>>> +				IS_ALIGNED((uintptr_t)flash_cache, ALIGN_REQ);
> >>>>>> +}
> >>>>>> +
> >>>>>>     static bool bcmbca_nand_intc_ack(struct brcmnand_soc *soc)
> >>>>>>     {
> >>>>>>     	struct bcmbca_nand_soc *priv =
> >>>>>> @@ -56,6 +68,29 @@ static void bcmbca_nand_intc_set(struct brcmnand_soc *soc, bool en)
> >>>>>>     	brcmnand_writel(val, mmio);
> >>>>>>     }  
> >>>>>>     >> +static void bcmbca_read_data_bus(struct brcmnand_soc *soc,  
> >>>>>> +				 void __iomem *flash_cache,  u32 *buffer,
> >>>>>> +				 int fc_words, bool is_param)
> >>>>>> +{
> >>>>>> +	int i;
> >>>>>> +
> >>>>>> +	if (!is_param) {
> >>>>>> +		/*
> >>>>>> +		 * memcpy can do unaligned aligned access depending on source
> >>>>>> +		 * and dest address, which is incompatible with nand cache. Fallback
> >>>>>> +		 * to the memcpy for io version
> >>>>>> +		 */
> >>>>>> +		if (bcmbca_nand_is_buf_aligned(flash_cache, buffer))
> >>>>>> +			memcpy((void *)buffer, (void *)flash_cache, fc_words * 4);
> >>>>>> +		else
> >>>>>> +			memcpy_fromio((void *)buffer, (void *)flash_cache, fc_words * 4);
> >>>>>> +	} else {
> >>>>>> +		/* Flash cache has same endian as the host for parameter pages */
> >>>>>> +		for (i = 0; i < fc_words; i++, buffer++)
> >>>>>> +			*buffer = __raw_readl(flash_cache + i * 4);
> >>>>>> +	}
> >>>>>> +}
> >>>>>> +
> >>>>>>     static int bcmbca_nand_probe(struct platform_device *pdev)
> >>>>>>     {
> >>>>>>     	struct device *dev = &pdev->dev;
> >>>>>> @@ -75,6 +110,7 @@ static int bcmbca_nand_probe(struct platform_device *pdev)  
> >>>>>>     >>   	soc->ctlrdy_ack = bcmbca_nand_intc_ack;  
> >>>>>>     	soc->ctlrdy_set_enabled = bcmbca_nand_intc_set;
> >>>>>> +	soc->read_data_bus = bcmbca_read_data_bus;  
> >>>>>>     >>   	return brcmnand_probe(pdev, soc);  
> >>>>>>     }
> >>>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> >>>>>> index d920e88c7f5b..656be4d73016 100644
> >>>>>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> >>>>>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
> >>>>>> @@ -814,6 +814,30 @@ static inline u32 edu_readl(struct brcmnand_controller *ctrl,
> >>>>>>     	return brcmnand_readl(ctrl->edu_base + offs);
> >>>>>>     }  
> >>>>>>     >> +static inline void brcmnand_read_data_bus(struct brcmnand_controller *ctrl,  
> >>>>>> +					   void __iomem *flash_cache, u32 *buffer,
> >>>>>> +					   int fc_words, bool is_param)
> >>>>>> +{
> >>>>>> +	struct brcmnand_soc *soc = ctrl->soc;
> >>>>>> +	int i;
> >>>>>> +
> >>>>>> +	if (soc->read_data_bus) {
> >>>>>> +		soc->read_data_bus(soc, flash_cache, buffer, fc_words, is_param);
> >>>>>> +	} else {
> >>>>>> +		if (!is_param) {
> >>>>>> +			for (i = 0; i < fc_words; i++, buffer++)
> >>>>>> +				*buffer = brcmnand_read_fc(ctrl, i);
> >>>>>> +		} else {
> >>>>>> +			for (i = 0; i < fc_words; i++)
> >>>>>> +				/*
> >>>>>> +				 * Flash cache is big endian for parameter pages, at
> >>>>>> +				 * least on STB SoCs
> >>>>>> +				 */
> >>>>>> +				buffer[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
> >>>>>> +		}
> >>>>>> +	}  
> >>>>>
> >>>>> Perhaps we could have a single function that is statically assigned at
> >>>>> probe time instead of a first helper with two conditions which calls in
> >>>>> one case another hook... This can be simplified I guess.  
> >>>>>     >> Well this will need to be done at the SoC specific implementation level (bcm<xxx>_nand.c) and each SoC will need to have either general data bus read func with is_param option or data_bus_read_page, data_bus_read_param.  
> >>>
> >>> You told me in case we would use exec_op we could avoid the param
> >>> cache. If that's true then the whole support can be simplified.  
> >>>    >> Correct we may possibly unified the parameter data read but exec_op is long shot and we are not fully ready for that yet. It also depends on if the low level data register has endianess difference for the parameter data between difference SoCs.  
> >>
> >> So I would like to push the current implementation and we can explore the exec_op option late which will be a much big and complete different implementation.  
> > 
> > I am sorry but this series is totally backwards, you're trying to guess
> > what comes next with the 'is_param' thing, it's exactly what we are
> > fighting against since 2017. There are plenty of ->exec_op()
> > conversions out there, I don't believe this one will be harder. You
> > need to convert the driver to this new API and get rid of this whole
> > endianness non-sense to simplify a lot the driver.
> >   
> I am not guessing anything but just factor out the existing common nand cache read logic into the single default function(or one for page read and another for parameter read as I mentioned in another thread) and allow SoC to overrides the implementation when needed.

No, you are trying to guess what type of read the core is performing,
either a regular data page read or a parameter page read.

> I agree ->exec_op can possibly get rid of the parameter page read function and is the way to go. But it won't help on the page read for endianess.

You told me there is no endianness issue with the data pages, so why
won't it help with the page read?

> It's not that I am against exec_op but I want to take one step a time
> and I'd like to get these fixes

I don't see any fix here. Let me know if I am missing something, but
right now I see a new version of the controller being supported with
its own constraints. If you are fixing existing code for an already
supported platform, then make it clear and we can discuss this. But if
you just want to support the bcmbca flavor, then there is no risk
mitigation involved here, and a conversion is the right step :)

> and support for bcmbca soc first and
> then work on the exec_op API to minimize the change and reduce the
> risk.
> 
> >>  
> >>>>    Not sure how much this can be simplified... Or we have default
> >>>> implementation in brcmnand.c but then there is one condition check
> >>>> too. Page read is done at 512 bytes burst. One or two conditions
> >>>> check outside of the per 512 bytes read loop does not sounds too bad
> >>>> if performance is concern.  
> >>>
> >>> It is unreadable. That is my main concern.  
> >>>    >>>>   >>>>>> +}  
> >>>>>> +
> >>>>>>     static void brcmnand_clear_ecc_addr(struct brcmnand_controller *ctrl)
> >>>>>>     {  
> >>>>>>     >> @@ -1811,20 +1835,11 @@ static void brcmnand_cmdfunc(struct nand_chip *chip, unsigned command,  
> >>>>>>     			native_cmd == CMD_PARAMETER_CHANGE_COL) {
> >>>>>>     		/* Copy flash cache word-wise */
> >>>>>>     		u32 *flash_cache = (u32 *)ctrl->flash_cache;
> >>>>>> -		int i;  
> >>>>>>     >>   		brcmnand_soc_data_bus_prepare(ctrl->soc, true);
> >>>>>>     >> -		/*  
> >>>>>> -		 * Must cache the FLASH_CACHE now, since changes in
> >>>>>> -		 * SECTOR_SIZE_1K may invalidate it
> >>>>>> -		 */
> >>>>>> -		for (i = 0; i < FC_WORDS; i++)
> >>>>>> -			/*
> >>>>>> -			 * Flash cache is big endian for parameter pages, at
> >>>>>> -			 * least on STB SoCs
> >>>>>> -			 */
> >>>>>> -			flash_cache[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
> >>>>>> +		brcmnand_read_data_bus(ctrl, ctrl->nand_fc, flash_cache,
> >>>>>> +				   FC_WORDS, true);  
> >>>>>>     >>   		brcmnand_soc_data_bus_unprepare(ctrl->soc, true);
> >>>>>>     >> @@ -2137,7 +2152,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,  
> >>>>>>     {
> >>>>>>     	struct brcmnand_host *host = nand_get_controller_data(chip);
> >>>>>>     	struct brcmnand_controller *ctrl = host->ctrl;
> >>>>>> -	int i, j, ret = 0;
> >>>>>> +	int i, ret = 0;  
> >>>>>>     >>   	brcmnand_clear_ecc_addr(ctrl);
> >>>>>>     >> @@ -2150,8 +2165,9 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,  
> >>>>>>     		if (likely(buf)) {
> >>>>>>     			brcmnand_soc_data_bus_prepare(ctrl->soc, false);  
> >>>>>>     >> -			for (j = 0; j < FC_WORDS; j++, buf++)  
> >>>>>> -				*buf = brcmnand_read_fc(ctrl, j);
> >>>>>> +			brcmnand_read_data_bus(ctrl, ctrl->nand_fc, buf,
> >>>>>> +					FC_WORDS, false);
> >>>>>> +			buf += FC_WORDS;  
> >>>>>>     >>   			brcmnand_soc_data_bus_unprepare(ctrl->soc, false);  
> >>>>>>     		}
> >>>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.h b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> >>>>>> index f1f93d85f50d..88819bc395f8 100644
> >>>>>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> >>>>>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
> >>>>>> @@ -24,6 +24,8 @@ struct brcmnand_soc {
> >>>>>>     	void (*ctlrdy_set_enabled)(struct brcmnand_soc *soc, bool en);
> >>>>>>     	void (*prepare_data_bus)(struct brcmnand_soc *soc, bool prepare,
> >>>>>>     				 bool is_param);
> >>>>>> +	void (*read_data_bus)(struct brcmnand_soc *soc, void __iomem *flash_cache,
> >>>>>> +				 u32 *buffer, int fc_words, bool is_param);
> >>>>>>     	const struct brcmnand_io_ops *ops;
> >>>>>>     };  
> >>>>>>     > >  
> >>>>> Thanks,
> >>>>> Miquèl  
> >>>>>     > >  
> >>> Thanks,
> >>> Miquèl  
> >>>    > >   
> > Thanks,
> > Miquèl
> >   


Thanks,
Miquèl
Miquel Raynal June 12, 2023, 5:53 p.m. UTC | #12
Hello again,

> > >>>>> Perhaps we could have a single function that is statically assigned at
> > >>>>> probe time instead of a first helper with two conditions which calls in
> > >>>>> one case another hook... This can be simplified I guess.  
> > >>>>>     >> Well this will need to be done at the SoC specific implementation level (bcm<xxx>_nand.c) and each SoC will need to have either general data bus read func with is_param option or data_bus_read_page, data_bus_read_param.  
> > >>>
> > >>> You told me in case we would use exec_op we could avoid the param
> > >>> cache. If that's true then the whole support can be simplified.  
> > >>>    >> Correct we may possibly unified the parameter data read but exec_op is long shot and we are not fully ready for that yet. It also depends on if the low level data register has endianess difference for the parameter data between difference SoCs.  
> > >>
> > >> So I would like to push the current implementation and we can explore the exec_op option late which will be a much big and complete different implementation.  
> > > 
> > > I am sorry but this series is totally backwards, you're trying to guess
> > > what comes next with the 'is_param' thing, it's exactly what we are
> > > fighting against since 2017. There are plenty of ->exec_op()
> > > conversions out there, I don't believe this one will be harder. You
> > > need to convert the driver to this new API and get rid of this whole
> > > endianness non-sense to simplify a lot the driver.
> > >   
> > I am not guessing anything but just factor out the existing common nand cache read logic into the single default function(or one for page read and another for parameter read as I mentioned in another thread) and allow SoC to overrides the implementation when needed.
> 
> No, you are trying to guess what type of read the core is performing,
> either a regular data page read or a parameter page read.
> 
> > I agree ->exec_op can possibly get rid of the parameter page read function and is the way to go. But it won't help on the page read for endianess.
> 
> You told me there is no endianess issue with the data pages, so why it
> won't help on the page read?
> 
> > It's not that I am against exec_op but I want to take one step a time
> > and I'd like to get these fixes
> 
> I don't see any fix here? Let me know if I am missing something but
> right now I see a new version of the controller being supported with
> its own constraints. If you are fixing existing code for already
> supported platform, then make it clear and we can discuss this. But if
> you just want to support the bcmbca flavor, then there is no risk
> mitigation involved here, and a conversion is the right step :)
> 

I forgot to mention: the exec_op conversion is almost ready, Boris
worked on it but he lacked the hardware so maybe you'll just need to
revive the few patches which target your platform and do a little bit of
debugging?

https://github.com/bbrezillon/linux/commits/nand/exec-op-conversion?after=8a3cf6fd25d5e15c6667f9e95c1fc86e4cb735e6+34&branch=nand%2Fexec-op-conversion&qualified_name=refs%2Fheads%2Fnand%2Fexec-op-conversion

Cheers,
Miquèl
William Zhang June 12, 2023, 7:03 p.m. UTC | #13
On 06/12/2023 10:49 AM, Miquel Raynal wrote:
> Hi William,
> 
> william.zhang@broadcom.com wrote on Fri, 9 Jun 2023 12:16:27 -0700:
> 
>> Hi Miquel,
>>
>> On 06/09/2023 01:35 AM, Miquel Raynal wrote:
>>> Hi William,
>>>
>>> william.zhang@broadcom.com wrote on Thu, 8 Jun 2023 12:10:06 -0700:
>>>    
>>>> On 06/07/2023 11:18 PM, Miquel Raynal wrote:
>>>>> Hi William,
>>>>>
>>>>> william.zhang@broadcom.com wrote on Wed, 7 Jun 2023 13:24:23 -0700:
>>>>>     >>>> Hi Miquel,
>>>>>>
>>>>>> On 06/07/2023 01:22 AM, Miquel Raynal wrote:
>>>>>>> Hi William,
>>>>>>>
>>>>>>> william.zhang@broadcom.com wrote on Tue,  6 Jun 2023 16:12:50 -0700:
>>>>>>>      >>>> The BCMBCA broadband SoC integrates the NAND controller differently than
>>>>>>>> STB, iProc and other SoCs.  It has different endianness for NAND cache
>>>>>>>> data and ONFI parameter data.
>>>>>>>>
>>>>>>>> Add a SoC read data bus shim for BCMBCA to meet the specific SoC need
>>>>>>>> and performance improvement using the optimized memcpy function on NAND
>>>>>>>> cache memory.
>>>>>>>>
>>>>>>>> Signed-off-by: William Zhang <william.zhang@broadcom.com>
>>>>>>>> ---
>>>>>>>>
>>>>>>>>      drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c | 36 +++++++++++++++++
>>>>>>>>      drivers/mtd/nand/raw/brcmnand/brcmnand.c    | 44 ++++++++++++++-------
>>>>>>>>      drivers/mtd/nand/raw/brcmnand/brcmnand.h    |  2 +
>>>>>>>>      3 files changed, 68 insertions(+), 14 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>>>>>>>> index 7e48b6a0bfa2..899103a62c98 100644
>>>>>>>> --- a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>>>>>>>> +++ b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
>>>>>>>> @@ -26,6 +26,18 @@ enum {
>>>>>>>>      	BCMBCA_CTLRDY		= BIT(4),
>>>>>>>>      };
>>>>>>>>      >> +#if defined(CONFIG_ARM64)
>>>>>>>> +#define ALIGN_REQ		8
>>>>>>>> +#else
>>>>>>>> +#define ALIGN_REQ		4
>>>>>>>> +#endif
>>>>>>>> +
>>>>>>>> +static inline bool bcmbca_nand_is_buf_aligned(void *flash_cache,  void *buffer)
>>>>>>>> +{
>>>>>>>> +	return IS_ALIGNED((uintptr_t)buffer, ALIGN_REQ) &&
>>>>>>>> +				IS_ALIGNED((uintptr_t)flash_cache, ALIGN_REQ);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>      static bool bcmbca_nand_intc_ack(struct brcmnand_soc *soc)
>>>>>>>>      {
>>>>>>>>      	struct bcmbca_nand_soc *priv =
>>>>>>>> @@ -56,6 +68,29 @@ static void bcmbca_nand_intc_set(struct brcmnand_soc *soc, bool en)
>>>>>>>>      	brcmnand_writel(val, mmio);
>>>>>>>>      }
>>>>>>>>      >> +static void bcmbca_read_data_bus(struct brcmnand_soc *soc,
>>>>>>>> +				 void __iomem *flash_cache,  u32 *buffer,
>>>>>>>> +				 int fc_words, bool is_param)
>>>>>>>> +{
>>>>>>>> +	int i;
>>>>>>>> +
>>>>>>>> +	if (!is_param) {
>>>>>>>> +		/*
>>>>>>>> +		 * memcpy can do unaligned aligned access depending on source
>>>>>>>> +		 * and dest address, which is incompatible with nand cache. Fallback
>>>>>>>> +		 * to the memcpy for io version
>>>>>>>> +		 */
>>>>>>>> +		if (bcmbca_nand_is_buf_aligned(flash_cache, buffer))
>>>>>>>> +			memcpy((void *)buffer, (void *)flash_cache, fc_words * 4);
>>>>>>>> +		else
>>>>>>>> +			memcpy_fromio((void *)buffer, (void *)flash_cache, fc_words * 4);
>>>>>>>> +	} else {
>>>>>>>> +		/* Flash cache has same endian as the host for parameter pages */
>>>>>>>> +		for (i = 0; i < fc_words; i++, buffer++)
>>>>>>>> +			*buffer = __raw_readl(flash_cache + i * 4);
>>>>>>>> +	}
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>      static int bcmbca_nand_probe(struct platform_device *pdev)
>>>>>>>>      {
>>>>>>>>      	struct device *dev = &pdev->dev;
>>>>>>>> @@ -75,6 +110,7 @@ static int bcmbca_nand_probe(struct platform_device *pdev)
>>>>>>>>      >>   	soc->ctlrdy_ack = bcmbca_nand_intc_ack;
>>>>>>>>      	soc->ctlrdy_set_enabled = bcmbca_nand_intc_set;
>>>>>>>> +	soc->read_data_bus = bcmbca_read_data_bus;
>>>>>>>>      >>   	return brcmnand_probe(pdev, soc);
>>>>>>>>      }
>>>>>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>>>>>>>> index d920e88c7f5b..656be4d73016 100644
>>>>>>>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>>>>>>>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
>>>>>>>> @@ -814,6 +814,30 @@ static inline u32 edu_readl(struct brcmnand_controller *ctrl,
>>>>>>>>      	return brcmnand_readl(ctrl->edu_base + offs);
>>>>>>>>      }
>>>>>>>>      >> +static inline void brcmnand_read_data_bus(struct brcmnand_controller *ctrl,
>>>>>>>> +					   void __iomem *flash_cache, u32 *buffer,
>>>>>>>> +					   int fc_words, bool is_param)
>>>>>>>> +{
>>>>>>>> +	struct brcmnand_soc *soc = ctrl->soc;
>>>>>>>> +	int i;
>>>>>>>> +
>>>>>>>> +	if (soc->read_data_bus) {
>>>>>>>> +		soc->read_data_bus(soc, flash_cache, buffer, fc_words, is_param);
>>>>>>>> +	} else {
>>>>>>>> +		if (!is_param) {
>>>>>>>> +			for (i = 0; i < fc_words; i++, buffer++)
>>>>>>>> +				*buffer = brcmnand_read_fc(ctrl, i);
>>>>>>>> +		} else {
>>>>>>>> +			for (i = 0; i < fc_words; i++)
>>>>>>>> +				/*
>>>>>>>> +				 * Flash cache is big endian for parameter pages, at
>>>>>>>> +				 * least on STB SoCs
>>>>>>>> +				 */
>>>>>>>> +				buffer[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
>>>>>>>> +		}
>>>>>>>> +	}
>>>>>>>
>>>>>>> Perhaps we could have a single function that is statically assigned at
>>>>>>> probe time instead of a first helper with two conditions which calls in
>>>>>>> one case another hook... This can be simplified I guess.
>>>>>>>      >> Well this will need to be done at the SoC specific implementation level (bcm<xxx>_nand.c) and each SoC will need to have either general data bus read func with is_param option or data_bus_read_page, data_bus_read_param.
>>>>>
>>>>> You told me in case we would use exec_op we could avoid the param
>>>>> cache. If that's true then the whole support can be simplified.
>>>>>     >> Correct we may possibly unified the parameter data read but exec_op is long shot and we are not fully ready for that yet. It also depends on if the low level data register has endianess difference for the parameter data between difference SoCs.
>>>>
>>>> So I would like to push the current implementation and we can explore the exec_op option late which will be a much big and complete different implementation.
>>>
>>> I am sorry but this series is totally backwards, you're trying to guess
>>> what comes next with the 'is_param' thing, it's exactly what we are
>>> fighting against since 2017. There are plenty of ->exec_op()
>>> conversions out there, I don't believe this one will be harder. You
>>> need to convert the driver to this new API and get rid of this whole
>>> endianness non-sense to simplify a lot the driver.
>>>    
>> I am not guessing anything but just factor out the existing common nand cache read logic into the single default function(or one for page read and another for parameter read as I mentioned in another thread) and allow SoC to overrides the implementation when needed.
> 
> No, you are trying to guess what type of read the core is performing,
> either a regular data page read or a parameter page read.
> 
Okay this is what you mean by guessing. I didn't realize that ;)

>> I agree ->exec_op can possibly get rid of the parameter page read function and is the way to go. But it won't help on the page read for endianess.
> 
> You told me there is no endianess issue with the data pages, so why it
> won't help on the page read?
> 
Even with exec_op, the page read path for brcmnand (chip->ecc.read_page)
will still need the brcmnand_read_page function, for which I eventually
need a per-SoC implementation, at least for bcmbca for now, besides the
different endianness between SoCs. For bcmbca, I also use memcpy in the
patch, as the nand cache in the bcmbca chip can handle the optimized copy
code as long as the buffer is aligned, for better performance.

>> It's not that I am against exec_op but I want to take one step a time
>> and I'd like to get these fixes
> 
> I don't see any fix here? Let me know if I am missing something but
> right now I see a new version of the controller being supported with
> its own constraints. If you are fixing existing code for already
> supported platform, then make it clear and we can discuss this. But if
> you just want to support the bcmbca flavor, then there is no risk
> mitigation involved here, and a conversion is the right step :)
> 
I mean the patch 1 to 4 in this series.

The exec_op will apply to all the five SoCs under the brcmnand folder, not
just bcmbca. It will take a lot of time even just to find people to
test/debug all of them, as I don't have access to other SoCs and boards,
on top of the nature of this big change.

>> and support for bcmbca soc first and
>> then work on the exec_op API to minimize the change and reduce the
>> risk.
>>
>>>>   
>>>>>>     Not sure how much this can be simplified... Or we have default
>>>>>> implementation in brcmnand.c but then there is one condition check
>>>>>> too. Page read is done at 512 bytes burst. One or two conditions
>>>>>> check outside of the per 512 bytes read loop does not sounds too bad
>>>>>> if performance is concern.
>>>>>
>>>>> It is unreadable. That is my main concern.
>>>>>     >>>>   >>>>>> +}
>>>>>>>> +
>>>>>>>>      static void brcmnand_clear_ecc_addr(struct brcmnand_controller *ctrl)
>>>>>>>>      {
>>>>>>>>      >> @@ -1811,20 +1835,11 @@ static void brcmnand_cmdfunc(struct nand_chip *chip, unsigned command,
>>>>>>>>      			native_cmd == CMD_PARAMETER_CHANGE_COL) {
>>>>>>>>      		/* Copy flash cache word-wise */
>>>>>>>>      		u32 *flash_cache = (u32 *)ctrl->flash_cache;
>>>>>>>> -		int i;
>>>>>>>>      >>   		brcmnand_soc_data_bus_prepare(ctrl->soc, true);
>>>>>>>>      >> -		/*
>>>>>>>> -		 * Must cache the FLASH_CACHE now, since changes in
>>>>>>>> -		 * SECTOR_SIZE_1K may invalidate it
>>>>>>>> -		 */
>>>>>>>> -		for (i = 0; i < FC_WORDS; i++)
>>>>>>>> -			/*
>>>>>>>> -			 * Flash cache is big endian for parameter pages, at
>>>>>>>> -			 * least on STB SoCs
>>>>>>>> -			 */
>>>>>>>> -			flash_cache[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
>>>>>>>> +		brcmnand_read_data_bus(ctrl, ctrl->nand_fc, flash_cache,
>>>>>>>> +				   FC_WORDS, true);
>>>>>>>>      >>   		brcmnand_soc_data_bus_unprepare(ctrl->soc, true);
>>>>>>>>      >> @@ -2137,7 +2152,7 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>>>>>>>>      {
>>>>>>>>      	struct brcmnand_host *host = nand_get_controller_data(chip);
>>>>>>>>      	struct brcmnand_controller *ctrl = host->ctrl;
>>>>>>>> -	int i, j, ret = 0;
>>>>>>>> +	int i, ret = 0;
>>>>>>>>      >>   	brcmnand_clear_ecc_addr(ctrl);
>>>>>>>>      >> @@ -2150,8 +2165,9 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
>>>>>>>>      		if (likely(buf)) {
>>>>>>>>      			brcmnand_soc_data_bus_prepare(ctrl->soc, false);
>>>>>>>>      >> -			for (j = 0; j < FC_WORDS; j++, buf++)
>>>>>>>> -				*buf = brcmnand_read_fc(ctrl, j);
>>>>>>>> +			brcmnand_read_data_bus(ctrl, ctrl->nand_fc, buf,
>>>>>>>> +					FC_WORDS, false);
>>>>>>>> +			buf += FC_WORDS;
>>>>>>>>      >>   			brcmnand_soc_data_bus_unprepare(ctrl->soc, false);
>>>>>>>>      		}
>>>>>>>> diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.h b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>>>>>>>> index f1f93d85f50d..88819bc395f8 100644
>>>>>>>> --- a/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>>>>>>>> +++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
>>>>>>>> @@ -24,6 +24,8 @@ struct brcmnand_soc {
>>>>>>>>      	void (*ctlrdy_set_enabled)(struct brcmnand_soc *soc, bool en);
>>>>>>>>      	void (*prepare_data_bus)(struct brcmnand_soc *soc, bool prepare,
>>>>>>>>      				 bool is_param);
>>>>>>>> +	void (*read_data_bus)(struct brcmnand_soc *soc, void __iomem *flash_cache,
>>>>>>>> +				 u32 *buffer, int fc_words, bool is_param);
>>>>>>>>      	const struct brcmnand_io_ops *ops;
>>>>>>>>      };
>>>>>>>>      > >
>>>>>>> Thanks,
>>>>>>> Miquèl
>>>>>>>      > >
>>>>> Thanks,
>>>>> Miquèl
>>>>>     > >
>>> Thanks,
>>> Miquèl
>>>    
> 
> 
> Thanks,
> Miquèl
>
William Zhang June 12, 2023, 7:18 p.m. UTC | #14
On 06/12/2023 10:53 AM, Miquel Raynal wrote:
> Hello again,
> 
>>>>>>>> Perhaps we could have a single function that is statically assigned at
>>>>>>>> probe time instead of a first helper with two conditions which calls in
>>>>>>>> one case another hook... This can be simplified I guess.
>>>>>>>>      >> Well this will need to be done at the SoC specific implementation level (bcm<xxx>_nand.c) and each SoC will need to have either general data bus read func with is_param option or data_bus_read_page, data_bus_read_param.
>>>>>>
>>>>>> You told me in case we would use exec_op we could avoid the param
>>>>>> cache. If that's true then the whole support can be simplified.
>>>>>>     >> Correct we may possibly unified the parameter data read but exec_op is long shot and we are not fully ready for that yet. It also depends on if the low level data register has endianess difference for the parameter data between difference SoCs.
>>>>>
>>>>> So I would like to push the current implementation and we can explore the exec_op option late which will be a much big and complete different implementation.
>>>>
>>>> I am sorry but this series is totally backwards, you're trying to guess
>>>> what comes next with the 'is_param' thing, it's exactly what we are
>>>> fighting against since 2017. There are plenty of ->exec_op()
>>>> conversions out there, I don't believe this one will be harder. You
>>>> need to convert the driver to this new API and get rid of this whole
>>>> endianness non-sense to simplify a lot the driver.
>>>>    
>>> I am not guessing anything but just factor out the existing common nand cache read logic into the single default function(or one for page read and another for parameter read as I mentioned in another thread) and allow SoC to overrides the implementation when needed.
>>
>> No, you are trying to guess what type of read the core is performing,
>> either a regular data page read or a parameter page read.
>>
>>> I agree ->exec_op can possibly get rid of the parameter page read function and is the way to go. But it won't help on the page read for endianess.
>>
>> You told me there is no endianess issue with the data pages, so why it
>> won't help on the page read?
>>
>>> It's not that I am against exec_op but I want to take one step a time
>>> and I'd like to get these fixes
>>
>> I don't see any fix here? Let me know if I am missing something but
>> right now I see a new version of the controller being supported with
>> its own constraints. If you are fixing existing code for already
>> supported platform, then make it clear and we can discuss this. But if
>> you just want to support the bcmbca flavor, then there is no risk
>> mitigation involved here, and a conversion is the right step :)
>>
> 
> I forgot to mention: the exec_op conversion is almost ready, Boris
> worked on it but he lacked the hardware so maybe you'll just need to
> revive the few patches which target your platform and do a little bit of
> debugging?
> 
> https://github.com/bbrezillon/linux/commits/nand/exec-op-conversion?after=8a3cf6fd25d5e15c6667f9e95c1fc86e4cb735e6+34&branch=nand%2Fexec-op-conversion&qualified_name=refs%2Fheads%2Fnand%2Fexec-op-conversion
> 
Yes, this is the patch that our exec_op work is based on. Thanks Boris!
The issue with the patch is that performance is very slow for anything that
relies on nand_read_page_op, as the patch implements it using the low
level cmd and data register to transfer the data byte by byte.  I
actually sent out an email regarding this to Boris and he cc'ed you in Sept
last year. We have to use the nand parser to match the page read from
exec_op so we can actually match and use the brcmnand_page_read fast
path. But there are many situations that we need to match, so the project
to migrate to exec_op is still a work in progress, just on our bcmbca chip
as of now.  I just forwarded that email again to you and I would appreciate
it if you have any input there.  So IMHO it is just too risky and too big
of a scope to have the exec_op added to this patch series, and definitely
better to do it afterwards with a dedicated patch.

> Cheers,
> Miquèl
>
Miquel Raynal June 13, 2023, 6:42 a.m. UTC | #15
Hi William,

william.zhang@broadcom.com wrote on Mon, 12 Jun 2023 12:18:58 -0700:

> On 06/12/2023 10:53 AM, Miquel Raynal wrote:
> > Hello again,
> >   
> >>>>>>>> Perhaps we could have a single function that is statically assigned at
> >>>>>>>> probe time instead of a first helper with two conditions which calls in
> >>>>>>>> one case another hook... This can be simplified I guess.  
> >>>>>>>>      >> Well this will need to be done at the SoC specific implementation level (bcm<xxx>_nand.c) and each SoC will need to have either general data bus read func with is_param option or data_bus_read_page, data_bus_read_param.  
> >>>>>>
> >>>>>> You told me in case we would use exec_op we could avoid the param
> >>>>>> cache. If that's true then the whole support can be simplified.  
> >>>>>>     >> Correct we may possibly unified the parameter data read but exec_op is long shot and we are not fully ready for that yet. It also depends on if the low level data register has endianess difference for the parameter data between difference SoCs.  
> >>>>>
> >>>>> So I would like to push the current implementation and we can explore the exec_op option late which will be a much big and complete different implementation.  
> >>>>
> >>>> I am sorry but this series is totally backwards, you're trying to guess
> >>>> what comes next with the 'is_param' thing, it's exactly what we are
> >>>> fighting against since 2017. There are plenty of ->exec_op()
> >>>> conversions out there, I don't believe this one will be harder. You
> >>>> need to convert the driver to this new API and get rid of this whole
> >>>> endianness non-sense to simplify a lot the driver.  
> >>>>    >>> I am not guessing anything but just factor out the existing common nand cache read logic into the single default function(or one for page read and another for parameter read as I mentioned in another thread) and allow SoC to overrides the implementation when needed.  
> >>
> >> No, you are trying to guess what type of read the core is performing,
> >> either a regular data page read or a parameter page read.
> >>  
> >>> I agree ->exec_op can possibly get rid of the parameter page read function and is the way to go. But it won't help on the page read for endianess.  
> >>
> >> You told me there is no endianess issue with the data pages, so why it
> >> won't help on the page read?
> >>  
> >>> It's not that I am against exec_op but I want to take one step a time
> >>> and I'd like to get these fixes  
> >>
> >> I don't see any fix here? Let me know if I am missing something but
> >> right now I see a new version of the controller being supported with
> >> its own constraints. If you are fixing existing code for already
> >> supported platform, then make it clear and we can discuss this. But if
> >> you just want to support the bcmbca flavor, then there is no risk
> >> mitigation involved here, and a conversion is the right step :)
> >>  
> > 
> > I forgot to mention: the exec_op conversion is almost ready, Boris
> > worked on it but he lacked the hardware so maybe you'll just need to
> > revive the few patches which target your platform and do a little bit of
> > debugging?
> > 
> > https://github.com/bbrezillon/linux/commits/nand/exec-op-conversion?after=8a3cf6fd25d5e15c6667f9e95c1fc86e4cb735e6+34&branch=nand%2Fexec-op-conversion&qualified_name=refs%2Fheads%2Fnand%2Fexec-op-conversion
> >   
> Yes this is the patch what our exec_op work is based on. Thanks Boris! The issue with patch is that performance is very slow for anything that rely on nand_read_page_op as the patch implementing it using the low level cmd and data register to transfer the data byte by byte.

You don't need to use exec_op for your read_page/write_page hooks,
quite the opposite actually. exec_op is not meant for high throughput.
exec_op is meant to be simple. You can have fast I/Os with a different
mechanism in your read/write_page hooks.

>  I actually sent out email regarding this to Boris and he cc'ed you in
>  sept last year. We have to use the nand parser to match the page read
>  from exec_op so we can actually match and use the brcmnand_page_read
>  fast path. But there are many situations that we need to match so the
>  project to migrate exce_op are still work in progress just on our
>  bcmbca chip as of now.  Just forward that email again to you and I
>  appreciate it if you have any inputs there. So IMHO it is just too
>  risky and too big of scope to have the exec_op added to this patch
>  series and definitively better to do it afterwards with a dedicated
>  patch.

As long as you add small and orthogonal changes to cmd_ctrl/cmd_func
I don't mind, but what you want now is to force me to pull dirty
changes "first", the type of change we are refusing since 2018, making
me expect you'll perform the conversion after. It would have been
terribly less dirty and you would have all your code already upstreamed
if you had performed the exec_op conversion since September.

Thanks,
Miquèl
William Zhang June 14, 2023, midnight UTC | #16
Hi Miquel,

On 06/12/2023 11:42 PM, Miquel Raynal wrote:
> Hi William,
> 
> william.zhang@broadcom.com wrote on Mon, 12 Jun 2023 12:18:58 -0700:
> 
>> On 06/12/2023 10:53 AM, Miquel Raynal wrote:
>>> Hello again,
>>>    
>>>>>>>>>> Perhaps we could have a single function that is statically assigned at
>>>>>>>>>> probe time instead of a first helper with two conditions which calls in
>>>>>>>>>> one case another hook... This can be simplified I guess.
>>>>>>>>>>       >> Well this will need to be done at the SoC specific implementation level (bcm<xxx>_nand.c) and each SoC will need to have either general data bus read func with is_param option or data_bus_read_page, data_bus_read_param.
>>>>>>>>
>>>>>>>> You told me in case we would use exec_op we could avoid the param
>>>>>>>> cache. If that's true then the whole support can be simplified.
>>>>>>>>      >> Correct we may possibly unified the parameter data read but exec_op is long shot and we are not fully ready for that yet. It also depends on if the low level data register has endianess difference for the parameter data between difference SoCs.
>>>>>>>
>>>>>>> So I would like to push the current implementation and we can explore the exec_op option late which will be a much big and complete different implementation.
>>>>>>
>>>>>> I am sorry but this series is totally backwards, you're trying to guess
>>>>>> what comes next with the 'is_param' thing, it's exactly what we are
>>>>>> fighting against since 2017. There are plenty of ->exec_op()
>>>>>> conversions out there, I don't believe this one will be harder. You
>>>>>> need to convert the driver to this new API and get rid of this whole
>>>>>> endianness non-sense to simplify a lot the driver.
>>>>>>     >>> I am not guessing anything but just factor out the existing common nand cache read logic into the single default function(or one for page read and another for parameter read as I mentioned in another thread) and allow SoC to overrides the implementation when needed.
>>>>
>>>> No, you are trying to guess what type of read the core is performing,
>>>> either a regular data page read or a parameter page read.
>>>>   
>>>>> I agree ->exec_op can possibly get rid of the parameter page read function and is the way to go. But it won't help on the page read for endianess.
>>>>
>>>> You told me there is no endianess issue with the data pages, so why it
>>>> won't help on the page read?
>>>>   
>>>>> It's not that I am against exec_op but I want to take one step a time
>>>>> and I'd like to get these fixes
>>>>
>>>> I don't see any fix here? Let me know if I am missing something but
>>>> right now I see a new version of the controller being supported with
>>>> its own constraints. If you are fixing existing code for already
>>>> supported platform, then make it clear and we can discuss this. But if
>>>> you just want to support the bcmbca flavor, then there is no risk
>>>> mitigation involved here, and a conversion is the right step :)
>>>>   
>>>
>>> I forgot to mention: the exec_op conversion is almost ready, Boris
>>> worked on it but he lacked the hardware so maybe you'll just need to
>>> revive the few patches which target your platform and do a little bit of
>>> debugging?
>>>
>>> https://github.com/bbrezillon/linux/commits/nand/exec-op-conversion?after=8a3cf6fd25d5e15c6667f9e95c1fc86e4cb735e6+34&branch=nand%2Fexec-op-conversion&qualified_name=refs%2Fheads%2Fnand%2Fexec-op-conversion
>>>    
>> Yes this is the patch what our exec_op work is based on. Thanks Boris! The issue with patch is that performance is very slow for anything that rely on nand_read_page_op as the patch implementing it using the low level cmd and data register to transfer the data byte by byte.
> 
> You don't need to use exec_op for your read_page/write_page hooks,
> quite the opposite actually. exec_op is not meant for high throughput.
> exec_op is meant to be simple. You can have fast I/Os with a different
> mechanism in your read/write_page hooks.
> 
Right, it does not impact our fast path: controller based ecc read/write.
But things like the on-chip ecc nand driver that uses the exec_op API get
impacted badly. We need to add a nand op parser, several matching rules
and other logic to use the fast path page read/write instead of the low
level data register read/write.

>>   I actually sent out email regarding this to Boris and he cc'ed you in
>>   sept last year. We have to use the nand parser to match the page read
>>   from exec_op so we can actually match and use the brcmnand_page_read
>>   fast path. But there are many situations that we need to match so the
>>   project to migrate exce_op are still work in progress just on our
>>   bcmbca chip as of now.  Just forward that email again to you and I
>>   appreciate it if you have any inputs there. So IMHO it is just too
>>   risky and too big of scope to have the exec_op added to this patch
>>   series and definitively better to do it afterwards with a dedicated
>>   patch.
> 
> As long as you add small and orthogonal changes to cmd_ctrl/cmd_func
> I don't mind, but what you want now is to force me to pull dirty
> changes "first", the type of change we are refusing since 2018, making
> me expect you'll perform the conversion after. It would have been
> terribly less dirty and you would have all your code already upstreamed
> if you had performed the exec_op conversion since September.
> 
I didn't work on open source 5 years ago. I am sorry that I missed the
background of the rejected changes since then, but I do not agree that
this change is a dirty change just because I factor out the code with an
is_param argument (and I offered an alternative to remove is_param with
two data read functions).

I see your point with exec_op and agree that is the way to go.  We had
an initial look at Boris's exec_op patch last Sept and noticed the
performance issue, but we haven't had the chance to actively work on
improving the performance and preparing for up-streaming until recently.
What if we bring in the original exec_op patch in this series so we
don't need to add the parameter data read function (if we verify it works
on different SoCs without endianness issues)?  Or is it better to have
exec_op as a separate patch first and then this series?  Then we provide
another patch to improve the performance for exec_op, as this work is
still in progress and requires more testing.

> Thanks,
> Miquèl
>
Miquel Raynal June 14, 2023, 6:22 a.m. UTC | #17
Hi William,

william.zhang@broadcom.com wrote on Tue, 13 Jun 2023 17:00:19 -0700:

> Hi Miquel,
> 
> On 06/12/2023 11:42 PM, Miquel Raynal wrote:
> > Hi William,
> > 
> > william.zhang@broadcom.com wrote on Mon, 12 Jun 2023 12:18:58 -0700:
> >   
> >> On 06/12/2023 10:53 AM, Miquel Raynal wrote:  
> >>> Hello again,  
> >>>    >>>>>>>>>> Perhaps we could have a single function that is statically assigned at  
> >>>>>>>>>> probe time instead of a first helper with two conditions which calls in
> >>>>>>>>>> one case another hook... This can be simplified I guess.  
> >>>>>>>>>>       >> Well this will need to be done at the SoC specific implementation level (bcm<xxx>_nand.c) and each SoC will need to have either general data bus read func with is_param option or data_bus_read_page, data_bus_read_param.  
> >>>>>>>>
> >>>>>>>> You told me in case we would use exec_op we could avoid the param
> >>>>>>>> cache. If that's true then the whole support can be simplified.  
> >>>>>>>>      >> Correct we may possibly unified the parameter data read but exec_op is long shot and we are not fully ready for that yet. It also depends on if the low level data register has endianess difference for the parameter data between difference SoCs.  
> >>>>>>>
> >>>>>>> So I would like to push the current implementation and we can explore the exec_op option late which will be a much big and complete different implementation.  
> >>>>>>
> >>>>>> I am sorry but this series is totally backwards, you're trying to guess
> >>>>>> what comes next with the 'is_param' thing, it's exactly what we are
> >>>>>> fighting against since 2017. There are plenty of ->exec_op()
> >>>>>> conversions out there, I don't believe this one will be harder. You
> >>>>>> need to convert the driver to this new API and get rid of this whole
> >>>>>> endianness non-sense to simplify a lot the driver.  
> >>>>>>     >>> I am not guessing anything but just factor out the existing common nand cache read logic into the single default function(or one for page read and another for parameter read as I mentioned in another thread) and allow SoC to overrides the implementation when needed.  
> >>>>
> >>>> No, you are trying to guess what type of read the core is performing,
> >>>> either a regular data page read or a parameter page read.  
> >>>>   >>>>> I agree ->exec_op can possibly get rid of the parameter page read function and is the way to go. But it won't help on the page read for endianess.  
> >>>>
> >>>> You told me there is no endianess issue with the data pages, so why it
> >>>> won't help on the page read?  
> >>>>   >>>>> It's not that I am against exec_op but I want to take one step a time  
> >>>>> and I'd like to get these fixes  
> >>>>
> >>>> I don't see any fix here? Let me know if I am missing something but
> >>>> right now I see a new version of the controller being supported with
> >>>> its own constraints. If you are fixing existing code for already
> >>>> supported platform, then make it clear and we can discuss this. But if
> >>>> you just want to support the bcmbca flavor, then there is no risk
> >>>> mitigation involved here, and a conversion is the right step :)  
> >>>>   >>>  
> >>> I forgot to mention: the exec_op conversion is almost ready, Boris
> >>> worked on it but he lacked the hardware so maybe you'll just need to
> >>> revive the few patches which target your platform and do a little bit of
> >>> debugging?
> >>>
> >>> https://github.com/bbrezillon/linux/commits/nand/exec-op-conversion?after=8a3cf6fd25d5e15c6667f9e95c1fc86e4cb735e6+34&branch=nand%2Fexec-op-conversion&qualified_name=refs%2Fheads%2Fnand%2Fexec-op-conversion  
> >>>    >> Yes this is the patch what our exec_op work is based on. Thanks Boris! The issue with patch is that performance is very slow for anything that rely on nand_read_page_op as the patch implementing it using the low level cmd and data register to transfer the data byte by byte.  
> > 
> > You don't need to use exec_op for your read_page/write_page hooks,
> > quite the opposite actually. exec_op is not meant for high throughput.
> > exec_op is meant to be simple. You can have fast I/Os with a different
> > mechanism in your read/write_page hooks.
> >   
> Right it does not impact our fast path: controller based ecc read/write. But things like on-chip ecc nand driver that uses exec_op API get impacted badly. We need to add nand op parser, several matching rules and other logics to use fast path page read/write instead of the low level data register read/write.
> 
> >>   I actually sent out email regarding this to Boris and he cc'ed you in
> >>   sept last year. We have to use the nand parser to match the page read
> >>   from exec_op so we can actually match and use the brcmnand_page_read
> >>   fast path. But there are many situations that we need to match so the
> >>   project to migrate exce_op are still work in progress just on our
> >>   bcmbca chip as of now.  Just forward that email again to you and I
> >>   appreciate it if you have any inputs there. So IMHO it is just too
> >>   risky and too big of scope to have the exec_op added to this patch
> >>   series and definitively better to do it afterwards with a dedicated
> >>   patch.  
> > 
> > As long as you add small and orthogonal changes to cmd_ctrl/cmd_func
> > I don't mind, but what you want now is to force me to pull dirty
> > changes "first", the type of change we are refusing since 2018, making
> > me expect you'll perform the conversion after. It would have been
> > terribly less dirty and you would have all your code already upstreamed
> > if you had performed the exec_op conversion since September.
> >   
> I didn't work on open source 5 years ago. I am sorry that I missed the background of the rejected changes since then but I do not agree that this change is dirty change just because I factor out the code with is_param argument(and I offered an alternative to remove is_param with two data read functions).

This _is_ dirty because you cannot know with the cmd_ctrl/cmdfunc
API whether we read a parameter page or a page of data. So you are
_guessing_. There are plenty of ways of reading one or the other, the
heuristics on the controller side will _always_ be wrong. That is why
exec_op() was introduced.

> I see your point with exec_op and agree that is the way to go.  We had an initial look at Boris's exec_op patch last Sept and noticed the performance issue, but we haven't had the chance to actively work on improving the performance and preparing for upstreaming until recently. What if we bring in the original exec_op patch in this series so we don't need to add the parameter data read function (if we verify it works on different SoCs without endianness issues)?  Or is it better to have exec_op as a separate patch first and then this series?

This one is my favorite:
1/ Add exec_op support
2/ Remove legacy hooks
3/ Add support for the bcmbca SoC

Then you can improve the performance for on-die ECC situations, but to
be honest this improvement looks like a very little addition. You can
take example from the existing hooks, how they match specific
operations in the parser and then hook them to specific helpers.
Nothing terribly complex, there are dozens of conversions available
now.

Good luck :)
Miquèl
William Zhang June 14, 2023, 11:52 p.m. UTC | #18
On 06/13/2023 11:22 PM, Miquel Raynal wrote:
> Hi William,
> 
> william.zhang@broadcom.com wrote on Tue, 13 Jun 2023 17:00:19 -0700:
> 
>> Hi Miquel,
>>
>> On 06/12/2023 11:42 PM, Miquel Raynal wrote:
>>> Hi William,
>>>
>>> william.zhang@broadcom.com wrote on Mon, 12 Jun 2023 12:18:58 -0700:
>>>    
>>>> On 06/12/2023 10:53 AM, Miquel Raynal wrote:
>>>>> Hello again,
>>>>>     >>>>>>>>>> Perhaps we could have a single function that is statically assigned at
>>>>>>>>>>>> probe time instead of a first helper with two conditions which calls in
>>>>>>>>>>>> one case another hook... This can be simplified I guess.
>>>>>>>>>>>>        >> Well this will need to be done at the SoC specific implementation level (bcm<xxx>_nand.c) and each SoC will need to have either general data bus read func with is_param option or data_bus_read_page, data_bus_read_param.
>>>>>>>>>>
>>>>>>>>>> You told me in case we would use exec_op we could avoid the param
>>>>>>>>>> cache. If that's true then the whole support can be simplified.
>>>>>>>>>>       >> Correct we may possibly unified the parameter data read but exec_op is long shot and we are not fully ready for that yet. It also depends on if the low level data register has endianess difference for the parameter data between difference SoCs.
>>>>>>>>>
>>>>>>>>> So I would like to push the current implementation and we can explore the exec_op option late which will be a much big and complete different implementation.
>>>>>>>>
>>>>>>>> I am sorry but this series is totally backwards, you're trying to guess
>>>>>>>> what comes next with the 'is_param' thing, it's exactly what we are
>>>>>>>> fighting against since 2017. There are plenty of ->exec_op()
>>>>>>>> conversions out there, I don't believe this one will be harder. You
>>>>>>>> need to convert the driver to this new API and get rid of this whole
>>>>>>>> endianness non-sense to simplify a lot the driver.
>>>>>>>>      >>> I am not guessing anything but just factor out the existing common nand cache read logic into the single default function(or one for page read and another for parameter read as I mentioned in another thread) and allow SoC to overrides the implementation when needed.
>>>>>>
>>>>>> No, you are trying to guess what type of read the core is performing,
>>>>>> either a regular data page read or a parameter page read.
>>>>>>    >>>>> I agree ->exec_op can possibly get rid of the parameter page read function and is the way to go. But it won't help on the page read for endianess.
>>>>>>
>>>>>> You told me there is no endianess issue with the data pages, so why it
>>>>>> won't help on the page read?
>>>>>>    >>>>> It's not that I am against exec_op but I want to take one step a time
>>>>>>> and I'd like to get these fixes
>>>>>>
>>>>>> I don't see any fix here? Let me know if I am missing something but
>>>>>> right now I see a new version of the controller being supported with
>>>>>> its own constraints. If you are fixing existing code for already
>>>>>> supported platform, then make it clear and we can discuss this. But if
>>>>>> you just want to support the bcmbca flavor, then there is no risk
>>>>>> mitigation involved here, and a conversion is the right step :)
>>>>>>    >>>
>>>>> I forgot to mention: the exec_op conversion is almost ready, Boris
>>>>> worked on it but he lacked the hardware so maybe you'll just need to
>>>>> revive the few patches which target your platform and do a little bit of
>>>>> debugging?
>>>>>
>>>>> https://github.com/bbrezillon/linux/commits/nand/exec-op-conversion?after=8a3cf6fd25d5e15c6667f9e95c1fc86e4cb735e6+34&branch=nand%2Fexec-op-conversion&qualified_name=refs%2Fheads%2Fnand%2Fexec-op-conversion
>>>>>     >> Yes this is the patch what our exec_op work is based on. Thanks Boris! The issue with patch is that performance is very slow for anything that rely on nand_read_page_op as the patch implementing it using the low level cmd and data register to transfer the data byte by byte.
>>>
>>> You don't need to use exec_op for your read_page/write_page hooks,
>>> quite the opposite actually. exec_op is not meant for high throughput.
>>> exec_op is meant to be simple. You can have fast I/Os with a different
>>> mechanism in your read/write_page hooks.
>>>    
>> Right it does not impact our fast path: controller based ecc read/write. But things like on-chip ecc nand driver that uses exec_op API get impacted badly. We need to add nand op parser, several matching rules and other logics to use fast path page read/write instead of the low level data register read/write.
>>
>>>>    I actually sent out email regarding this to Boris and he cc'ed you in
>>>>    sept last year. We have to use the nand parser to match the page read
>>>>    from exec_op so we can actually match and use the brcmnand_page_read
>>>>    fast path. But there are many situations that we need to match so the
>>>>    project to migrate to exec_op is still a work in progress just on our
>>>>    bcmbca chip as of now.  Just forward that email again to you and I
>>>>    appreciate it if you have any inputs there. So IMHO it is just too
>>>>    risky and too big of scope to have the exec_op added to this patch
>>>>    series and definitively better to do it afterwards with a dedicated
>>>>    patch.
>>>
>>> As long as you add small and orthogonal changes to cmd_ctrl/cmd_func
>>> I don't mind, but what you want now is to force me to pull dirty
>>> changes "first", the type of change we are refusing since 2018, making
>>> me expect you'll perform the conversion after. It would have been
>>> terribly less dirty and you would have all your code already upstreamed
>>> if you had performed the exec_op conversion since September.
>>>    
>> I didn't work on open source 5 years ago. I am sorry that I missed the background of the rejected changes since then but I do not agree that this change is dirty change just because I factor out the code with is_param argument(and I offered an alternative to remove is_param with two data read functions).
> 
> This _is_ dirty because you cannot know with the cmd_ctrl/cmdfunc
> API whether we read a parameter page or a page of data. So you are
> _guessing_. There are plenty of ways of reading one or the other, the
> heuristics on the controller side will _always_ be wrong. That is why
> exec_op() was introduced.
> 
Alright, we have different definitions of dirty ;) I understand it is not the
preferred way to update the code in the controller cmdfunc path, especially
for a large change that can be done in exec_op.

>> I see your point with exec_op and agree that is the way to go.  We had an initial look at Boris's exec_op patch last Sept and noticed the performance issue, but we haven't had the chance to actively work on improving the performance and preparing for upstreaming until recently. What if we bring in the original exec_op patch in this series so we don't need to add the parameter data read function (if we verify it works on different SoCs without endianness issues)?  Or is it better to have exec_op as a separate patch first and then this series?
> 
> This one is my favorite:
> 1/ Add exec_op support
> 2/ Remove legacy hooks
> 3/ Add support for the bcmbca SoC
> 
Sounds good.  We will send exec_op series for 1 and 2 then another 
series for 3.   And I will send v2 of this series to just include the 
fixes (patch 1 to patch 4) with updates based on the comments received.

> Then you can improve the performance for on-die ECC situations, but to
> be honest this improvement looks like a very little addition. You can
> take example from the existing hooks, how they match specific
> operations in the parser and then hook them to specific helpers.
> Nothing terribly complex, there are dozens of conversions available
> now.
> 
> Good luck :)
> Miquèl
>
diff mbox series

Patch

diff --git a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
index 7e48b6a0bfa2..899103a62c98 100644
--- a/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
+++ b/drivers/mtd/nand/raw/brcmnand/bcmbca_nand.c
@@ -26,6 +26,18 @@  enum {
 	BCMBCA_CTLRDY		= BIT(4),
 };
 
+#if defined(CONFIG_ARM64)
+#define ALIGN_REQ		8
+#else
+#define ALIGN_REQ		4
+#endif
+
+static inline bool bcmbca_nand_is_buf_aligned(void *flash_cache,  void *buffer)
+{
+	return IS_ALIGNED((uintptr_t)buffer, ALIGN_REQ) &&
+				IS_ALIGNED((uintptr_t)flash_cache, ALIGN_REQ);
+}
+
 static bool bcmbca_nand_intc_ack(struct brcmnand_soc *soc)
 {
 	struct bcmbca_nand_soc *priv =
@@ -56,6 +68,29 @@  static void bcmbca_nand_intc_set(struct brcmnand_soc *soc, bool en)
 	brcmnand_writel(val, mmio);
 }
 
+static void bcmbca_read_data_bus(struct brcmnand_soc *soc,
+				 void __iomem *flash_cache,  u32 *buffer,
+				 int fc_words, bool is_param)
+{
+	int i;
+
+	if (!is_param) {
+		/*
+		 * memcpy can do unaligned access depending on source
+		 * and dest address, which is incompatible with nand cache. Fall back
+		 * to the memcpy for io version
+		 */
+		if (bcmbca_nand_is_buf_aligned(flash_cache, buffer))
+			memcpy((void *)buffer, (void *)flash_cache, fc_words * 4);
+		else
+			memcpy_fromio((void *)buffer, (void *)flash_cache, fc_words * 4);
+	} else {
+		/* Flash cache has same endian as the host for parameter pages */
+		for (i = 0; i < fc_words; i++, buffer++)
+			*buffer = __raw_readl(flash_cache + i * 4);
+	}
+}
+
 static int bcmbca_nand_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
@@ -75,6 +110,7 @@  static int bcmbca_nand_probe(struct platform_device *pdev)
 
 	soc->ctlrdy_ack = bcmbca_nand_intc_ack;
 	soc->ctlrdy_set_enabled = bcmbca_nand_intc_set;
+	soc->read_data_bus = bcmbca_read_data_bus;
 
 	return brcmnand_probe(pdev, soc);
 }
diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.c b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
index d920e88c7f5b..656be4d73016 100644
--- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
+++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
@@ -814,6 +814,30 @@  static inline u32 edu_readl(struct brcmnand_controller *ctrl,
 	return brcmnand_readl(ctrl->edu_base + offs);
 }
 
+static inline void brcmnand_read_data_bus(struct brcmnand_controller *ctrl,
+					   void __iomem *flash_cache, u32 *buffer,
+					   int fc_words, bool is_param)
+{
+	struct brcmnand_soc *soc = ctrl->soc;
+	int i;
+
+	if (soc->read_data_bus) {
+		soc->read_data_bus(soc, flash_cache, buffer, fc_words, is_param);
+	} else {
+		if (!is_param) {
+			for (i = 0; i < fc_words; i++, buffer++)
+				*buffer = brcmnand_read_fc(ctrl, i);
+		} else {
+			for (i = 0; i < fc_words; i++)
+				/*
+				 * Flash cache is big endian for parameter pages, at
+				 * least on STB SoCs
+				 */
+				buffer[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
+		}
+	}
+}
+
 static void brcmnand_clear_ecc_addr(struct brcmnand_controller *ctrl)
 {
 
@@ -1811,20 +1835,11 @@  static void brcmnand_cmdfunc(struct nand_chip *chip, unsigned command,
 			native_cmd == CMD_PARAMETER_CHANGE_COL) {
 		/* Copy flash cache word-wise */
 		u32 *flash_cache = (u32 *)ctrl->flash_cache;
-		int i;
 
 		brcmnand_soc_data_bus_prepare(ctrl->soc, true);
 
-		/*
-		 * Must cache the FLASH_CACHE now, since changes in
-		 * SECTOR_SIZE_1K may invalidate it
-		 */
-		for (i = 0; i < FC_WORDS; i++)
-			/*
-			 * Flash cache is big endian for parameter pages, at
-			 * least on STB SoCs
-			 */
-			flash_cache[i] = be32_to_cpu(brcmnand_read_fc(ctrl, i));
+		brcmnand_read_data_bus(ctrl, ctrl->nand_fc, flash_cache,
+				   FC_WORDS, true);
 
 		brcmnand_soc_data_bus_unprepare(ctrl->soc, true);
 
@@ -2137,7 +2152,7 @@  static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
 {
 	struct brcmnand_host *host = nand_get_controller_data(chip);
 	struct brcmnand_controller *ctrl = host->ctrl;
-	int i, j, ret = 0;
+	int i, ret = 0;
 
 	brcmnand_clear_ecc_addr(ctrl);
 
@@ -2150,8 +2165,9 @@  static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
 		if (likely(buf)) {
 			brcmnand_soc_data_bus_prepare(ctrl->soc, false);
 
-			for (j = 0; j < FC_WORDS; j++, buf++)
-				*buf = brcmnand_read_fc(ctrl, j);
+			brcmnand_read_data_bus(ctrl, ctrl->nand_fc, buf,
+					FC_WORDS, false);
+			buf += FC_WORDS;
 
 			brcmnand_soc_data_bus_unprepare(ctrl->soc, false);
 		}
diff --git a/drivers/mtd/nand/raw/brcmnand/brcmnand.h b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
index f1f93d85f50d..88819bc395f8 100644
--- a/drivers/mtd/nand/raw/brcmnand/brcmnand.h
+++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.h
@@ -24,6 +24,8 @@  struct brcmnand_soc {
 	void (*ctlrdy_set_enabled)(struct brcmnand_soc *soc, bool en);
 	void (*prepare_data_bus)(struct brcmnand_soc *soc, bool prepare,
 				 bool is_param);
+	void (*read_data_bus)(struct brcmnand_soc *soc, void __iomem *flash_cache,
+				 u32 *buffer, int fc_words, bool is_param);
 	const struct brcmnand_io_ops *ops;
 };