diff mbox series

[v2,2/2] mmc: mediatek: extend number of tuning steps

Message ID 20231128070127.27442-3-axe.yang@mediatek.com (mailing list archive)
State New, archived
Headers show
Series mmc: mediatek: add support for 64-steps tuning | expand

Commit Message

Axe Yang (杨磊) Nov. 28, 2023, 7:01 a.m. UTC
Previously, during the MSDC calibration process, a full clock cycle
actually not be covered, which in some cases didn't yield the best
results and could cause CRC errors. This problem is particularly
evident when MSDC is used as an SDIO host. In fact, MSDC support
tuning up to a maximum of 64 steps, but by default, the step number
is 32. By increase the tuning step, we are more likely to cover more
parts of a clock cycle, and get better calibration result.

To illustrate, when tuning 32 steps, if the obtained window has a hole
near the middle, like this: 0xffc07ff (hex), then the selected delay
will be the 6 (counting from right to left).

(32 <- 1)
1111 1111 1100 0000 0000 0111 11(1)1 1111

However, if we tune 64 steps, the window obtained may look like this:
0xfffffffffffc07ff. The final selected delay will be 44, which is
safer as it is further away from the hole:

(64 <- 1)
1111 ... (1)111 1111 1111 1111 1111 1100 0000 0000 0111 1111 1111

In this case, delay 6 selected through 32 steps tuning is obviously
not optimal, and this delay is closer to the hole, using it would
easily cause CRC problems.

You will need to configure property "mediatek,tuning-step" in MSDC
dts node to 64 to extend the steps.

Signed-off-by: Axe Yang <axe.yang@mediatek.com>
---
 drivers/mmc/host/mtk-sd.c | 135 +++++++++++++++++++++++++++-----------
 1 file changed, 97 insertions(+), 38 deletions(-)

Comments

AngeloGioacchino Del Regno Nov. 28, 2023, 8:53 a.m. UTC | #1
Il 28/11/23 08:01, Axe Yang ha scritto:
> Previously, during the MSDC calibration process, a full clock cycle
> actually not be covered, which in some cases didn't yield the best
> results and could cause CRC errors. This problem is particularly
> evident when MSDC is used as an SDIO host. In fact, MSDC support
> tuning up to a maximum of 64 steps, but by default, the step number
> is 32. By increase the tuning step, we are more likely to cover more
> parts of a clock cycle, and get better calibration result.
> 
> To illustrate, when tuning 32 steps, if the obtained window has a hole
> near the middle, like this: 0xffc07ff (hex), then the selected delay
> will be the 6 (counting from right to left).
> 
> (32 <- 1)
> 1111 1111 1100 0000 0000 0111 11(1)1 1111
> 
> However, if we tune 64 steps, the window obtained may look like this:
> 0xfffffffffffc07ff. The final selected delay will be 44, which is
> safer as it is further away from the hole:
> 
> (64 <- 1)
> 1111 ... (1)111 1111 1111 1111 1111 1100 0000 0000 0111 1111 1111
> 
> In this case, delay 6 selected through 32 steps tuning is obviously
> not optimal, and this delay is closer to the hole, using it would
> easily cause CRC problems.
> 
> You will need to configure property "mediatek,tuning-step" in MSDC
> dts node to 64 to extend the steps.
> 

If we can run 64 tuning steps, why should we run 32?

Why isn't it just better to *always* run 64 tuning steps, on SoCs supporting that?

Thanks,
Angelo

> Signed-off-by: Axe Yang <axe.yang@mediatek.com>
> ---
>   drivers/mmc/host/mtk-sd.c | 135 +++++++++++++++++++++++++++-----------
>   1 file changed, 97 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c
> index 97f7c3d4be6e..c8297f501a1e 100644
> --- a/drivers/mmc/host/mtk-sd.c
> +++ b/drivers/mmc/host/mtk-sd.c
> @@ -252,12 +252,16 @@
>   
>   #define MSDC_PAD_TUNE_DATWRDLY	  GENMASK(4, 0)		/* RW */
>   #define MSDC_PAD_TUNE_DATRRDLY	  GENMASK(12, 8)	/* RW */
> +#define MSDC_PAD_TUNE_DATRRDLY2	  GENMASK(12, 8)	/* RW */
>   #define MSDC_PAD_TUNE_CMDRDLY	  GENMASK(20, 16)	/* RW */
> +#define MSDC_PAD_TUNE_CMDRDLY2	  GENMASK(20, 16)	/* RW */
>   #define MSDC_PAD_TUNE_CMDRRDLY	  GENMASK(26, 22)	/* RW */
>   #define MSDC_PAD_TUNE_CLKTDLY	  GENMASK(31, 27)	/* RW */
>   #define MSDC_PAD_TUNE_RXDLYSEL	  BIT(15)   /* RW */
>   #define MSDC_PAD_TUNE_RD_SEL	  BIT(13)   /* RW */
>   #define MSDC_PAD_TUNE_CMD_SEL	  BIT(21)   /* RW */
> +#define MSDC_PAD_TUNE_RD2_SEL	  BIT(13)   /* RW */
> +#define MSDC_PAD_TUNE_CMD2_SEL	  BIT(21)   /* RW */
>   
>   #define PAD_DS_TUNE_DLY_SEL       BIT(0)	  /* RW */
>   #define PAD_DS_TUNE_DLY1	  GENMASK(6, 2)   /* RW */
> @@ -325,7 +329,8 @@
>   
>   #define DEFAULT_DEBOUNCE	(8)	/* 8 cycles CD debounce */
>   
> -#define PAD_DELAY_MAX	32 /* PAD delay cells */
> +#define PAD_DELAY_HALF	32 /* PAD delay cells */
> +#define PAD_DELAY_FULL	64
>   /*--------------------------------------------------------------------------*/
>   /* Descriptor Structure                                                     */
>   /*--------------------------------------------------------------------------*/
> @@ -461,6 +466,7 @@ struct msdc_host {
>   	u32 hs400_ds_dly3;
>   	u32 hs200_cmd_int_delay; /* cmd internal delay for HS200/SDR104 */
>   	u32 hs400_cmd_int_delay; /* cmd internal delay for HS400 */
> +	u32 tuning_step;
>   	bool hs400_cmd_resp_sel_rising;
>   				 /* cmd response sample selection for HS400 */
>   	bool hs400_mode;	/* current eMMC will run at hs400 mode */
> @@ -1615,7 +1621,7 @@ static irqreturn_t msdc_cmdq_irq(struct msdc_host *host, u32 intsts)
>   	}
>   
>   	if (cmd_err || dat_err) {
> -		dev_err(host->dev, "cmd_err = %d, dat_err =%d, intsts = 0x%x",
> +		dev_err(host->dev, "cmd_err = %d, dat_err = %d, intsts = 0x%x",
>   			cmd_err, dat_err, intsts);
>   	}
>   
> @@ -1780,10 +1786,20 @@ static void msdc_init_hw(struct msdc_host *host)
>   				     DATA_K_VALUE_SEL);
>   			sdr_set_bits(host->top_base + EMMC_TOP_CMD,
>   				     PAD_CMD_RD_RXDLY_SEL);
> +			if (host->tuning_step > PAD_DELAY_HALF) {
> +				sdr_set_bits(host->top_base + EMMC_TOP_CONTROL,
> +					     PAD_DAT_RD_RXDLY2_SEL);
> +				sdr_set_bits(host->top_base + EMMC_TOP_CMD,
> +					     PAD_CMD_RD_RXDLY2_SEL);
> +			}
>   		} else {
>   			sdr_set_bits(host->base + tune_reg,
>   				     MSDC_PAD_TUNE_RD_SEL |
>   				     MSDC_PAD_TUNE_CMD_SEL);
> +			if (host->tuning_step > PAD_DELAY_HALF)
> +				sdr_set_bits(host->base + tune_reg + 4,
> +					     MSDC_PAD_TUNE_RD2_SEL |
> +					     MSDC_PAD_TUNE_CMD2_SEL);
>   		}
>   	} else {
>   		/* choose clock tune */
> @@ -1925,24 +1941,24 @@ static void msdc_ops_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
>   		msdc_set_mclk(host, ios->timing, ios->clock);
>   }
>   
> -static u32 test_delay_bit(u32 delay, u32 bit)
> +static u64 test_delay_bit(u64 delay, u32 bit)
>   {
> -	bit %= PAD_DELAY_MAX;
> +	bit %= PAD_DELAY_FULL;
>   	return delay & BIT(bit);
>   }
>   
> -static int get_delay_len(u32 delay, u32 start_bit)
> +static int get_delay_len(u64 delay, u32 start_bit)
>   {
>   	int i;
>   
> -	for (i = 0; i < (PAD_DELAY_MAX - start_bit); i++) {
> +	for (i = 0; i < (PAD_DELAY_FULL - start_bit); i++) {
>   		if (test_delay_bit(delay, start_bit + i) == 0)
>   			return i;
>   	}
> -	return PAD_DELAY_MAX - start_bit;
> +	return PAD_DELAY_FULL - start_bit;
>   }
>   
> -static struct msdc_delay_phase get_best_delay(struct msdc_host *host, u32 delay)
> +static struct msdc_delay_phase get_best_delay(struct msdc_host *host, u64 delay)
>   {
>   	int start = 0, len = 0;
>   	int start_final = 0, len_final = 0;
> @@ -1950,28 +1966,28 @@ static struct msdc_delay_phase get_best_delay(struct msdc_host *host, u32 delay)
>   	struct msdc_delay_phase delay_phase = { 0, };
>   
>   	if (delay == 0) {
> -		dev_err(host->dev, "phase error: [map:%x]\n", delay);
> +		dev_err(host->dev, "phase error: [map:%llx]\n", delay);
>   		delay_phase.final_phase = final_phase;
>   		return delay_phase;
>   	}
>   
> -	while (start < PAD_DELAY_MAX) {
> +	while (start < PAD_DELAY_FULL) {
>   		len = get_delay_len(delay, start);
>   		if (len_final < len) {
>   			start_final = start;
>   			len_final = len;
>   		}
>   		start += len ? len : 1;
> -		if (len >= 12 && start_final < 4)
> +		if (!upper_32_bits(delay) && len >= 12 && start_final < 4)
>   			break;
>   	}
>   
>   	/* The rule is that to find the smallest delay cell */
>   	if (start_final == 0)
> -		final_phase = (start_final + len_final / 3) % PAD_DELAY_MAX;
> +		final_phase = (start_final + len_final / 3) % PAD_DELAY_FULL;
>   	else
> -		final_phase = (start_final + len_final / 2) % PAD_DELAY_MAX;
> -	dev_dbg(host->dev, "phase: [map:%x] [maxlen:%d] [final:%d]\n",
> +		final_phase = (start_final + len_final / 2) % PAD_DELAY_FULL;
> +	dev_dbg(host->dev, "phase: [map:%llx] [maxlen:%d] [final:%d]\n",
>   		delay, len_final, final_phase);
>   
>   	delay_phase.maxlen = len_final;
> @@ -1984,30 +2000,68 @@ static inline void msdc_set_cmd_delay(struct msdc_host *host, u32 value)
>   {
>   	u32 tune_reg = host->dev_comp->pad_tune_reg;
>   
> -	if (host->top_base)
> -		sdr_set_field(host->top_base + EMMC_TOP_CMD, PAD_CMD_RXDLY,
> -			      value);
> -	else
> -		sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_CMDRDLY,
> -			      value);
> +	if (host->top_base) {
> +		if (value < PAD_DELAY_HALF) {
> +			sdr_set_field(host->top_base + EMMC_TOP_CMD, PAD_CMD_RXDLY,
> +				      value);
> +			sdr_set_field(host->top_base + EMMC_TOP_CMD, PAD_CMD_RXDLY2,
> +				      0);
> +		} else {
> +			sdr_set_field(host->top_base + EMMC_TOP_CMD, PAD_CMD_RXDLY,
> +				      PAD_DELAY_HALF - 1);
> +			sdr_set_field(host->top_base + EMMC_TOP_CMD, PAD_CMD_RXDLY2,
> +				      value - PAD_DELAY_HALF);
> +		}
> +	} else {
> +		if (value < PAD_DELAY_HALF) {
> +			sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_CMDRDLY,
> +				      value);
> +			sdr_set_field(host->base + tune_reg + 4, MSDC_PAD_TUNE_CMDRDLY2,
> +				      0);
> +		} else {
> +			sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_CMDRDLY,
> +				      PAD_DELAY_HALF - 1);
> +			sdr_set_field(host->base + tune_reg + 4, MSDC_PAD_TUNE_CMDRDLY2,
> +				      value - PAD_DELAY_HALF);
> +		}
> +	}
>   }
>   
>   static inline void msdc_set_data_delay(struct msdc_host *host, u32 value)
>   {
>   	u32 tune_reg = host->dev_comp->pad_tune_reg;
>   
> -	if (host->top_base)
> -		sdr_set_field(host->top_base + EMMC_TOP_CONTROL,
> -			      PAD_DAT_RD_RXDLY, value);
> -	else
> -		sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_DATRRDLY,
> -			      value);
> +	if (host->top_base) {
> +		if (value < PAD_DELAY_HALF) {
> +			sdr_set_field(host->top_base + EMMC_TOP_CONTROL,
> +				      PAD_DAT_RD_RXDLY, value);
> +			sdr_set_field(host->top_base + EMMC_TOP_CONTROL,
> +				      PAD_DAT_RD_RXDLY2, 0);
> +		} else {
> +			sdr_set_field(host->top_base + EMMC_TOP_CONTROL,
> +				      PAD_DAT_RD_RXDLY, PAD_DELAY_HALF - 1);
> +			sdr_set_field(host->top_base + EMMC_TOP_CONTROL,
> +				      PAD_DAT_RD_RXDLY2, value - PAD_DELAY_HALF);
> +		}
> +	} else {
> +		if (value < PAD_DELAY_HALF) {
> +			sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_DATRRDLY,
> +				      value);
> +			sdr_set_field(host->base + tune_reg + 4, MSDC_PAD_TUNE_DATRRDLY2,
> +				      0);
> +		} else {
> +			sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_DATRRDLY,
> +				      PAD_DELAY_HALF - 1);
> +			sdr_set_field(host->base + tune_reg + 4, MSDC_PAD_TUNE_DATRRDLY2,
> +				      value - PAD_DELAY_HALF);
> +		}
> +	}
>   }
>   
>   static int msdc_tune_response(struct mmc_host *mmc, u32 opcode)
>   {
>   	struct msdc_host *host = mmc_priv(mmc);
> -	u32 rise_delay = 0, fall_delay = 0;
> +	u64 rise_delay = 0, fall_delay = 0;
>   	struct msdc_delay_phase final_rise_delay, final_fall_delay = { 0,};
>   	struct msdc_delay_phase internal_delay_phase;
>   	u8 final_delay, final_maxlen;
> @@ -2023,7 +2077,7 @@ static int msdc_tune_response(struct mmc_host *mmc, u32 opcode)
>   			      host->hs200_cmd_int_delay);
>   
>   	sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
> -	for (i = 0 ; i < PAD_DELAY_MAX; i++) {
> +	for (i = 0; i < host->tuning_step; i++) {
>   		msdc_set_cmd_delay(host, i);
>   		/*
>   		 * Using the same parameters, it may sometimes pass the test,
> @@ -2047,7 +2101,7 @@ static int msdc_tune_response(struct mmc_host *mmc, u32 opcode)
>   		goto skip_fall;
>   
>   	sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
> -	for (i = 0; i < PAD_DELAY_MAX; i++) {
> +	for (i = 0; i < host->tuning_step; i++) {
>   		msdc_set_cmd_delay(host, i);
>   		/*
>   		 * Using the same parameters, it may sometimes pass the test,
> @@ -2082,7 +2136,7 @@ static int msdc_tune_response(struct mmc_host *mmc, u32 opcode)
>   	if (host->dev_comp->async_fifo || host->hs200_cmd_int_delay)
>   		goto skip_internal;
>   
> -	for (i = 0; i < PAD_DELAY_MAX; i++) {
> +	for (i = 0; i < host->tuning_step; i++) {
>   		sdr_set_field(host->base + tune_reg,
>   			      MSDC_PAD_TUNE_CMDRRDLY, i);
>   		mmc_send_tuning(mmc, opcode, &cmd_err);
> @@ -2121,7 +2175,8 @@ static int hs400_tune_response(struct mmc_host *mmc, u32 opcode)
>   		sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
>   	else
>   		sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
> -	for (i = 0 ; i < PAD_DELAY_MAX; i++) {
> +
> +	for (i = 0; i < PAD_DELAY_HALF; i++) {
>   		sdr_set_field(host->base + PAD_CMD_TUNE,
>   			      PAD_CMD_TUNE_RX_DLY3, i);
>   		/*
> @@ -2151,7 +2206,7 @@ static int hs400_tune_response(struct mmc_host *mmc, u32 opcode)
>   static int msdc_tune_data(struct mmc_host *mmc, u32 opcode)
>   {
>   	struct msdc_host *host = mmc_priv(mmc);
> -	u32 rise_delay = 0, fall_delay = 0;
> +	u64 rise_delay = 0, fall_delay = 0;
>   	struct msdc_delay_phase final_rise_delay, final_fall_delay = { 0,};
>   	u8 final_delay, final_maxlen;
>   	int i, ret;
> @@ -2160,7 +2215,7 @@ static int msdc_tune_data(struct mmc_host *mmc, u32 opcode)
>   		      host->latch_ck);
>   	sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_DSPL);
>   	sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_W_DSPL);
> -	for (i = 0 ; i < PAD_DELAY_MAX; i++) {
> +	for (i = 0; i < host->tuning_step; i++) {
>   		msdc_set_data_delay(host, i);
>   		ret = mmc_send_tuning(mmc, opcode, NULL);
>   		if (!ret)
> @@ -2174,7 +2229,7 @@ static int msdc_tune_data(struct mmc_host *mmc, u32 opcode)
>   
>   	sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_DSPL);
>   	sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_W_DSPL);
> -	for (i = 0; i < PAD_DELAY_MAX; i++) {
> +	for (i = 0; i < host->tuning_step; i++) {
>   		msdc_set_data_delay(host, i);
>   		ret = mmc_send_tuning(mmc, opcode, NULL);
>   		if (!ret)
> @@ -2206,7 +2261,7 @@ static int msdc_tune_data(struct mmc_host *mmc, u32 opcode)
>   static int msdc_tune_together(struct mmc_host *mmc, u32 opcode)
>   {
>   	struct msdc_host *host = mmc_priv(mmc);
> -	u32 rise_delay = 0, fall_delay = 0;
> +	u64 rise_delay = 0, fall_delay = 0;
>   	struct msdc_delay_phase final_rise_delay, final_fall_delay = { 0,};
>   	u8 final_delay, final_maxlen;
>   	int i, ret;
> @@ -2217,7 +2272,7 @@ static int msdc_tune_together(struct mmc_host *mmc, u32 opcode)
>   	sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_Ron the supported SoCsSPL);
>   	sdr_clr_bits(host->base + MSDC_IOCON,
>   		     MSDC_IOCON_DSPL | MSDC_IOCON_W_DSPL);
> -	for (i = 0 ; i < PAD_DELAY_MAX; i++) {
> +	for (i = 0; i < host->tuning_step; i++) {
>   		msdc_set_cmd_delay(host, i);
>   		msdc_set_data_delay(host, i);
>   		ret = mmc_send_tuning(mmc, opcode, NULL);
> @@ -2233,7 +2288,7 @@ static int msdc_tune_together(struct mmc_host *mmc, u32 opcode)
>   	sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
>   	sdr_set_bits(host->base + MSDC_IOCON,
>   		     MSDC_IOCON_DSPL | MSDC_IOCON_W_DSPL);
> -	for (i = 0; i < PAD_DELAY_MAX; i++) {
> +	for (i = 0; i < host->tuning_step; i++) {
>   		msdc_set_cmd_delay(host, i);
>   		msdc_set_data_delay(host, i);
>   		ret = mmc_send_tuning(mmc, opcode, NULL);
> @@ -2346,7 +2401,7 @@ static int msdc_execute_hs400_tuning(struct mmc_host *mmc, struct mmc_card *card
>   	}
>   
>   	host->hs400_tuning = true;
> -	for (i = 0; i < PAD_DELAY_MAX; i++) {
> +	for (i = 0; i < PAD_DELAY_HALF; i++) {
>   		if (host->top_base)
>   			sdr_set_field(host->top_base + EMMC50_PAD_DS_TUNE,
>   				      PAD_DS_DLY1, i);
> @@ -2601,6 +2656,10 @@ static void msdc_of_property_parse(struct platform_device *pdev,
>   	else
>   		host->hs400_cmd_resp_sel_rising = false;
>   
> +	if (of_property_read_u32(pdev->dev.of_node, "mediatek,tuning-step",
> +				 &host->tuning_step))
> +		host->tuning_step = PAD_DELAY_HALF;
> +
>   	if (of_property_read_bool(pdev->dev.of_node,
>   				  "supports-cqe"))
>   		host->cqhci = true;
Axe Yang (杨磊) Nov. 28, 2023, 9:38 a.m. UTC | #2
> On Tue, 2023-11-28 at 09:53 +0100, AngeloGioacchino Del Regno wrote:
> Il 28/11/23 08:01, Axe Yang ha scritto:
> > Previously, during the MSDC calibration process, a full clock cycle
> > actually not be covered, which in some cases didn't yield the best
> > results and could cause CRC errors. This problem is particularly
> > evident when MSDC is used as an SDIO host. In fact, MSDC support
> > tuning up to a maximum of 64 steps, but by default, the step number
> > is 32. By increase the tuning step, we are more likely to cover
> > more
> > parts of a clock cycle, and get better calibration result.
> > 
> > To illustrate, when tuning 32 steps, if the obtained window has a
> > hole
> > near the middle, like this: 0xffc07ff (hex), then the selected
> > delay
> > will be the 6 (counting from right to left).
> > 
> > (32 <- 1)
> > 1111 1111 1100 0000 0000 0111 11(1)1 1111
> > 
> > However, if we tune 64 steps, the window obtained may look like
> > this:
> > 0xfffffffffffc07ff. The final selected delay will be 44, which is
> > safer as it is further away from the hole:
> > 
> > (64 <- 1)
> > 1111 ... (1)111 1111 1111 1111 1111 1100 0000 0000 0111 1111 1111
> > 
> > In this case, delay 6 selected through 32 steps tuning is obviously
> > not optimal, and this delay is closer to the hole, using it would
> > easily cause CRC problems.
> > 
> > You will need to configure property "mediatek,tuning-step" in MSDC
> > dts node to 64 to extend the steps.
> > 
> 
> If we can run 64 tuning steps, why should we run 32?
> 
> Why isn't it just better to *always* run 64 tuning steps, on SoCs
> supporting that?
> 
> Thanks,
> Angelo

Hi Angelo,

That is a good question. The benefit of preserving 32 steps tuning is
that it can save time in certain scenarios. 

On some platforms, when the delay selected through 64 steps tuning is
very close to that chosen through 32 steps, we can reduce the tuning
step from 64 to 32. This can save time sending the tuning block
commands. 

Thus using 32 steps tuning can save kernel boot up time.

Another case where time can be saved is when accessing the RPMB
partition of eMMC. Each time switch to RPMB partition, there is a
retune action, causing a certain drop in performance. If we are certain
that the results of 32 steps tuning are usable and we use it, this can
in a sense also guarantee performance when accessing the RPMB
partition.

Regards,
Axe


> 
> > Signed-off-by: Axe Yang <axe.yang@mediatek.com>
> > ---
> >   drivers/mmc/host/mtk-sd.c | 135 +++++++++++++++++++++++++++----
> > -------
> >   1 file changed, 97 insertions(+), 38 deletions(-)
> > 
> > diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c
> > index 97f7c3d4be6e..c8297f501a1e 100644
> > --- a/drivers/mmc/host/mtk-sd.c
> > +++ b/drivers/mmc/host/mtk-sd.c
> > @@ -252,12 +252,16 @@
> >   
> >   #define MSDC_PAD_TUNE_DATWRDLY	  GENMASK(4, 0)		/*
> > RW */
> >   #define MSDC_PAD_TUNE_DATRRDLY	  GENMASK(12, 8)	/* RW */
> > +#define MSDC_PAD_TUNE_DATRRDLY2	  GENMASK(12, 8)	/* RW */
> >   #define MSDC_PAD_TUNE_CMDRDLY	  GENMASK(20, 16)	/* RW */
> > +#define MSDC_PAD_TUNE_CMDRDLY2	  GENMASK(20, 16)	/* RW */
> >   #define MSDC_PAD_TUNE_CMDRRDLY	  GENMASK(26, 22)	/* RW */
> >   #define MSDC_PAD_TUNE_CLKTDLY	  GENMASK(31, 27)	/* RW */
> >   #define MSDC_PAD_TUNE_RXDLYSEL	  BIT(15)   /* RW */
> >   #define MSDC_PAD_TUNE_RD_SEL	  BIT(13)   /* RW */
> >   #define MSDC_PAD_TUNE_CMD_SEL	  BIT(21)   /* RW */
> > +#define MSDC_PAD_TUNE_RD2_SEL	  BIT(13)   /* RW */
> > +#define MSDC_PAD_TUNE_CMD2_SEL	  BIT(21)   /* RW */
> >   
> >   #define PAD_DS_TUNE_DLY_SEL       BIT(0)	  /* RW */
> >   #define PAD_DS_TUNE_DLY1	  GENMASK(6, 2)   /* RW */
> > @@ -325,7 +329,8 @@
> >   
> >   #define DEFAULT_DEBOUNCE	(8)	/* 8 cycles CD debounce */
> >   
> > -#define PAD_DELAY_MAX	32 /* PAD delay cells */
> > +#define PAD_DELAY_HALF	32 /* PAD delay cells */
> > +#define PAD_DELAY_FULL	64
> >   /*---------------------------------------------------------------
> > -----------*/
> >   /* Descriptor
> > Structure                                                     */
> >   /*---------------------------------------------------------------
> > -----------*/
> > @@ -461,6 +466,7 @@ struct msdc_host {
> >   	u32 hs400_ds_dly3;
> >   	u32 hs200_cmd_int_delay; /* cmd internal delay for HS200/SDR104
> > */
> >   	u32 hs400_cmd_int_delay; /* cmd internal delay for HS400 */
> > +	u32 tuning_step;
> >   	bool hs400_cmd_resp_sel_rising;
> >   				 /* cmd response sample selection for
> > HS400 */
> >   	bool hs400_mode;	/* current eMMC will run at hs400 mode */
> > @@ -1615,7 +1621,7 @@ static irqreturn_t msdc_cmdq_irq(struct
> > msdc_host *host, u32 intsts)
> >   	}
> >   
> >   	if (cmd_err || dat_err) {
> > -		dev_err(host->dev, "cmd_err = %d, dat_err =%d, intsts =
> > 0x%x",
> > +		dev_err(host->dev, "cmd_err = %d, dat_err = %d, intsts
> > = 0x%x",
> >   			cmd_err, dat_err, intsts);
> >   	}
> >   
> > @@ -1780,10 +1786,20 @@ static void msdc_init_hw(struct msdc_host
> > *host)
> >   				     DATA_K_VALUE_SEL);
> >   			sdr_set_bits(host->top_base + EMMC_TOP_CMD,
> >   				     PAD_CMD_RD_RXDLY_SEL);
> > +			if (host->tuning_step > PAD_DELAY_HALF) {
> > +				sdr_set_bits(host->top_base +
> > EMMC_TOP_CONTROL,
> > +					     PAD_DAT_RD_RXDLY2_SEL);
> > +				sdr_set_bits(host->top_base +
> > EMMC_TOP_CMD,
> > +					     PAD_CMD_RD_RXDLY2_SEL);
> > +			}
> >   		} else {
> >   			sdr_set_bits(host->base + tune_reg,
> >   				     MSDC_PAD_TUNE_RD_SEL |
> >   				     MSDC_PAD_TUNE_CMD_SEL);
> > +			if (host->tuning_step > PAD_DELAY_HALF)
> > +				sdr_set_bits(host->base + tune_reg + 4,
> > +					     MSDC_PAD_TUNE_RD2_SEL |
> > +					     MSDC_PAD_TUNE_CMD2_SEL);
> >   		}
> >   	} else {
> >   		/* choose clock tune */
> > @@ -1925,24 +1941,24 @@ static void msdc_ops_set_ios(struct
> > mmc_host *mmc, struct mmc_ios *ios)
> >   		msdc_set_mclk(host, ios->timing, ios->clock);
> >   }
> >   
> > -static u32 test_delay_bit(u32 delay, u32 bit)
> > +static u64 test_delay_bit(u64 delay, u32 bit)
> >   {
> > -	bit %= PAD_DELAY_MAX;
> > +	bit %= PAD_DELAY_FULL;
> >   	return delay & BIT(bit);
> >   }
> >   
> > -static int get_delay_len(u32 delay, u32 start_bit)
> > +static int get_delay_len(u64 delay, u32 start_bit)
> >   {
> >   	int i;
> >   
> > -	for (i = 0; i < (PAD_DELAY_MAX - start_bit); i++) {
> > +	for (i = 0; i < (PAD_DELAY_FULL - start_bit); i++) {
> >   		if (test_delay_bit(delay, start_bit + i) == 0)
> >   			return i;
> >   	}
> > -	return PAD_DELAY_MAX - start_bit;
> > +	return PAD_DELAY_FULL - start_bit;
> >   }
> >   
> > -static struct msdc_delay_phase get_best_delay(struct msdc_host
> > *host, u32 delay)
> > +static struct msdc_delay_phase get_best_delay(struct msdc_host
> > *host, u64 delay)
> >   {
> >   	int start = 0, len = 0;
> >   	int start_final = 0, len_final = 0;
> > @@ -1950,28 +1966,28 @@ static struct msdc_delay_phase
> > get_best_delay(struct msdc_host *host, u32 delay)
> >   	struct msdc_delay_phase delay_phase = { 0, };
> >   
> >   	if (delay == 0) {
> > -		dev_err(host->dev, "phase error: [map:%x]\n", delay);
> > +		dev_err(host->dev, "phase error: [map:%llx]\n", delay);
> >   		delay_phase.final_phase = final_phase;
> >   		return delay_phase;
> >   	}
> >   
> > -	while (start < PAD_DELAY_MAX) {
> > +	while (start < PAD_DELAY_FULL) {
> >   		len = get_delay_len(delay, start);
> >   		if (len_final < len) {
> >   			start_final = start;
> >   			len_final = len;
> >   		}
> >   		start += len ? len : 1;
> > -		if (len >= 12 && start_final < 4)
> > +		if (!upper_32_bits(delay) && len >= 12 && start_final <
> > 4)
> >   			break;
> >   	}
> >   
> >   	/* The rule is that to find the smallest delay cell */
> >   	if (start_final == 0)
> > -		final_phase = (start_final + len_final / 3) %
> > PAD_DELAY_MAX;
> > +		final_phase = (start_final + len_final / 3) %
> > PAD_DELAY_FULL;
> >   	else
> > -		final_phase = (start_final + len_final / 2) %
> > PAD_DELAY_MAX;
> > -	dev_dbg(host->dev, "phase: [map:%x] [maxlen:%d] [final:%d]\n",
> > +		final_phase = (start_final + len_final / 2) %
> > PAD_DELAY_FULL;
> > +	dev_dbg(host->dev, "phase: [map:%llx] [maxlen:%d]
> > [final:%d]\n",
> >   		delay, len_final, final_phase);
> >   
> >   	delay_phase.maxlen = len_final;
> > @@ -1984,30 +2000,68 @@ static inline void
> > msdc_set_cmd_delay(struct msdc_host *host, u32 value)
> >   {
> >   	u32 tune_reg = host->dev_comp->pad_tune_reg;
> >   
> > -	if (host->top_base)
> > -		sdr_set_field(host->top_base + EMMC_TOP_CMD,
> > PAD_CMD_RXDLY,
> > -			      value);
> > -	else
> > -		sdr_set_field(host->base + tune_reg,
> > MSDC_PAD_TUNE_CMDRDLY,
> > -			      value);
> > +	if (host->top_base) {
> > +		if (value < PAD_DELAY_HALF) {
> > +			sdr_set_field(host->top_base + EMMC_TOP_CMD,
> > PAD_CMD_RXDLY,
> > +				      value);
> > +			sdr_set_field(host->top_base + EMMC_TOP_CMD,
> > PAD_CMD_RXDLY2,
> > +				      0);
> > +		} else {
> > +			sdr_set_field(host->top_base + EMMC_TOP_CMD,
> > PAD_CMD_RXDLY,
> > +				      PAD_DELAY_HALF - 1);
> > +			sdr_set_field(host->top_base + EMMC_TOP_CMD,
> > PAD_CMD_RXDLY2,
> > +				      value - PAD_DELAY_HALF);
> > +		}
> > +	} else {
> > +		if (value < PAD_DELAY_HALF) {
> > +			sdr_set_field(host->base + tune_reg,
> > MSDC_PAD_TUNE_CMDRDLY,
> > +				      value);
> > +			sdr_set_field(host->base + tune_reg + 4,
> > MSDC_PAD_TUNE_CMDRDLY2,
> > +				      0);
> > +		} else {
> > +			sdr_set_field(host->base + tune_reg,
> > MSDC_PAD_TUNE_CMDRDLY,
> > +				      PAD_DELAY_HALF - 1);
> > +			sdr_set_field(host->base + tune_reg + 4,
> > MSDC_PAD_TUNE_CMDRDLY2,
> > +				      value - PAD_DELAY_HALF);
> > +		}
> > +	}
> >   }
> >   
> >   static inline void msdc_set_data_delay(struct msdc_host *host,
> > u32 value)
> >   {
> >   	u32 tune_reg = host->dev_comp->pad_tune_reg;
> >   
> > -	if (host->top_base)
> > -		sdr_set_field(host->top_base + EMMC_TOP_CONTROL,
> > -			      PAD_DAT_RD_RXDLY, value);
> > -	else
> > -		sdr_set_field(host->base + tune_reg,
> > MSDC_PAD_TUNE_DATRRDLY,
> > -			      value);
> > +	if (host->top_base) {
> > +		if (value < PAD_DELAY_HALF) {
> > +			sdr_set_field(host->top_base +
> > EMMC_TOP_CONTROL,
> > +				      PAD_DAT_RD_RXDLY, value);
> > +			sdr_set_field(host->top_base +
> > EMMC_TOP_CONTROL,
> > +				      PAD_DAT_RD_RXDLY2, 0);
> > +		} else {
> > +			sdr_set_field(host->top_base +
> > EMMC_TOP_CONTROL,
> > +				      PAD_DAT_RD_RXDLY, PAD_DELAY_HALF
> > - 1);
> > +			sdr_set_field(host->top_base +
> > EMMC_TOP_CONTROL,
> > +				      PAD_DAT_RD_RXDLY2, value -
> > PAD_DELAY_HALF);
> > +		}
> > +	} else {
> > +		if (value < PAD_DELAY_HALF) {
> > +			sdr_set_field(host->base + tune_reg,
> > MSDC_PAD_TUNE_DATRRDLY,
> > +				      value);
> > +			sdr_set_field(host->base + tune_reg + 4,
> > MSDC_PAD_TUNE_DATRRDLY2,
> > +				      0);
> > +		} else {
> > +			sdr_set_field(host->base + tune_reg,
> > MSDC_PAD_TUNE_DATRRDLY,
> > +				      PAD_DELAY_HALF - 1);
> > +			sdr_set_field(host->base + tune_reg + 4,
> > MSDC_PAD_TUNE_DATRRDLY2,
> > +				      value - PAD_DELAY_HALF);
> > +		}
> > +	}
> >   }
> >   
> >   static int msdc_tune_response(struct mmc_host *mmc, u32 opcode)
> >   {
> >   	struct msdc_host *host = mmc_priv(mmc);
> > -	u32 rise_delay = 0, fall_delay = 0;
> > +	u64 rise_delay = 0, fall_delay = 0;
> >   	struct msdc_delay_phase final_rise_delay, final_fall_delay = {
> > 0,};
> >   	struct msdc_delay_phase internal_delay_phase;
> >   	u8 final_delay, final_maxlen;
> > @@ -2023,7 +2077,7 @@ static int msdc_tune_response(struct mmc_host
> > *mmc, u32 opcode)
> >   			      host->hs200_cmd_int_delay);
> >   
> >   	sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
> > -	for (i = 0 ; i < PAD_DELAY_MAX; i++) {
> > +	for (i = 0; i < host->tuning_step; i++) {
> >   		msdc_set_cmd_delay(host, i);
> >   		/*
> >   		 * Using the same parameters, it may sometimes pass the
> > test,
> > @@ -2047,7 +2101,7 @@ static int msdc_tune_response(struct mmc_host
> > *mmc, u32 opcode)
> >   		goto skip_fall;
> >   
> >   	sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
> > -	for (i = 0; i < PAD_DELAY_MAX; i++) {
> > +	for (i = 0; i < host->tuning_step; i++) {
> >   		msdc_set_cmd_delay(host, i);
> >   		/*
> >   		 * Using the same parameters, it may sometimes pass the
> > test,
> > @@ -2082,7 +2136,7 @@ static int msdc_tune_response(struct mmc_host
> > *mmc, u32 opcode)
> >   	if (host->dev_comp->async_fifo || host->hs200_cmd_int_delay)
> >   		goto skip_internal;
> >   
> > -	for (i = 0; i < PAD_DELAY_MAX; i++) {
> > +	for (i = 0; i < host->tuning_step; i++) {
> >   		sdr_set_field(host->base + tune_reg,
> >   			      MSDC_PAD_TUNE_CMDRRDLY, i);
> >   		mmc_send_tuning(mmc, opcode, &cmd_err);
> > @@ -2121,7 +2175,8 @@ static int hs400_tune_response(struct
> > mmc_host *mmc, u32 opcode)
> >   		sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
> >   	else
> >   		sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
> > -	for (i = 0 ; i < PAD_DELAY_MAX; i++) {
> > +
> > +	for (i = 0; i < PAD_DELAY_HALF; i++) {
> >   		sdr_set_field(host->base + PAD_CMD_TUNE,
> >   			      PAD_CMD_TUNE_RX_DLY3, i);
> >   		/*
> > @@ -2151,7 +2206,7 @@ static int hs400_tune_response(struct
> > mmc_host *mmc, u32 opcode)
> >   static int msdc_tune_data(struct mmc_host *mmc, u32 opcode)
> >   {
> >   	struct msdc_host *host = mmc_priv(mmc);
> > -	u32 rise_delay = 0, fall_delay = 0;
> > +	u64 rise_delay = 0, fall_delay = 0;
> >   	struct msdc_delay_phase final_rise_delay, final_fall_delay = {
> > 0,};
> >   	u8 final_delay, final_maxlen;
> >   	int i, ret;
> > @@ -2160,7 +2215,7 @@ static int msdc_tune_data(struct mmc_host
> > *mmc, u32 opcode)
> >   		      host->latch_ck);
> >   	sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_DSPL);
> >   	sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_W_DSPL);
> > -	for (i = 0 ; i < PAD_DELAY_MAX; i++) {
> > +	for (i = 0; i < host->tuning_step; i++) {
> >   		msdc_set_data_delay(host, i);
> >   		ret = mmc_send_tuning(mmc, opcode, NULL);
> >   		if (!ret)
> > @@ -2174,7 +2229,7 @@ static int msdc_tune_data(struct mmc_host
> > *mmc, u32 opcode)
> >   
> >   	sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_DSPL);
> >   	sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_W_DSPL);
> > -	for (i = 0; i < PAD_DELAY_MAX; i++) {
> > +	for (i = 0; i < host->tuning_step; i++) {
> >   		msdc_set_data_delay(host, i);
> >   		ret = mmc_send_tuning(mmc, opcode, NULL);
> >   		if (!ret)
> > @@ -2206,7 +2261,7 @@ static int msdc_tune_data(struct mmc_host
> > *mmc, u32 opcode)
> >   static int msdc_tune_together(struct mmc_host *mmc, u32 opcode)
> >   {
> >   	struct msdc_host *host = mmc_priv(mmc);
> > -	u32 rise_delay = 0, fall_delay = 0;
> > +	u64 rise_delay = 0, fall_delay = 0;
> >   	struct msdc_delay_phase final_rise_delay, final_fall_delay = {
> > 0,};
> >   	u8 final_delay, final_maxlen;
> >   	int i, ret;
> > @@ -2217,7 +2272,7 @@ static int msdc_tune_together(struct mmc_host
> > *mmc, u32 opcode)
> >   	sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_Ron the
> > supported SoCsSPL);
> >   	sdr_clr_bits(host->base + MSDC_IOCON,
> >   		     MSDC_IOCON_DSPL | MSDC_IOCON_W_DSPL);
> > -	for (i = 0 ; i < PAD_DELAY_MAX; i++) {
> > +	for (i = 0; i < host->tuning_step; i++) {
> >   		msdc_set_cmd_delay(host, i);
> >   		msdc_set_data_delay(host, i);
> >   		ret = mmc_send_tuning(mmc, opcode, NULL);
> > @@ -2233,7 +2288,7 @@ static int msdc_tune_together(struct mmc_host
> > *mmc, u32 opcode)
> >   	sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
> >   	sdr_set_bits(host->base + MSDC_IOCON,
> >   		     MSDC_IOCON_DSPL | MSDC_IOCON_W_DSPL);
> > -	for (i = 0; i < PAD_DELAY_MAX; i++) {
> > +	for (i = 0; i < host->tuning_step; i++) {
> >   		msdc_set_cmd_delay(host, i);
> >   		msdc_set_data_delay(host, i);
> >   		ret = mmc_send_tuning(mmc, opcode, NULL);
> > @@ -2346,7 +2401,7 @@ static int msdc_execute_hs400_tuning(struct
> > mmc_host *mmc, struct mmc_card *card
> >   	}
> >   
> >   	host->hs400_tuning = true;
> > -	for (i = 0; i < PAD_DELAY_MAX; i++) {
> > +	for (i = 0; i < PAD_DELAY_HALF; i++) {
> >   		if (host->top_base)
> >   			sdr_set_field(host->top_base +
> > EMMC50_PAD_DS_TUNE,
> >   				      PAD_DS_DLY1, i);
> > @@ -2601,6 +2656,10 @@ static void msdc_of_property_parse(struct
> > platform_device *pdev,
> >   	else
> >   		host->hs400_cmd_resp_sel_rising = false;
> >   
> > +	if (of_property_read_u32(pdev->dev.of_node, "mediatek,tuning-
> > step",
> > +				 &host->tuning_step))
> > +		host->tuning_step = PAD_DELAY_HALF;
> > +
> >   	if (of_property_read_bool(pdev->dev.of_node,
> >   				  "supports-cqe"))
> >   		host->cqhci = true;
> 
> 
>
AngeloGioacchino Del Regno Nov. 28, 2023, 10:20 a.m. UTC | #3
Il 28/11/23 10:38, Axe Yang (杨磊) ha scritto:
>> On Tue, 2023-11-28 at 09:53 +0100, AngeloGioacchino Del Regno wrote:
>> Il 28/11/23 08:01, Axe Yang ha scritto:
>>> Previously, during the MSDC calibration process, a full clock cycle
>>> actually not be covered, which in some cases didn't yield the best
>>> results and could cause CRC errors. This problem is particularly
>>> evident when MSDC is used as an SDIO host. In fact, MSDC support
>>> tuning up to a maximum of 64 steps, but by default, the step number
>>> is 32. By increase the tuning step, we are more likely to cover
>>> more
>>> parts of a clock cycle, and get better calibration result.
>>>
>>> To illustrate, when tuning 32 steps, if the obtained window has a
>>> hole
>>> near the middle, like this: 0xffc07ff (hex), then the selected
>>> delay
>>> will be the 6 (counting from right to left).
>>>
>>> (32 <- 1)
>>> 1111 1111 1100 0000 0000 0111 11(1)1 1111
>>>
>>> However, if we tune 64 steps, the window obtained may look like
>>> this:
>>> 0xfffffffffffc07ff. The final selected delay will be 44, which is
>>> safer as it is further away from the hole:
>>>
>>> (64 <- 1)
>>> 1111 ... (1)111 1111 1111 1111 1111 1100 0000 0000 0111 1111 1111
>>>
>>> In this case, delay 6 selected through 32 steps tuning is obviously
>>> not optimal, and this delay is closer to the hole, using it would
>>> easily cause CRC problems.
>>>
>>> You will need to configure property "mediatek,tuning-step" in MSDC
>>> dts node to 64 to extend the steps.
>>>
>>
>> If we can run 64 tuning steps, why should we run 32?
>>
>> Why isn't it just better to *always* run 64 tuning steps, on SoCs
>> supporting that?
>>
>> Thanks,
>> Angelo
> 
> Hi Angelo,
> 
> That is a good question. The benefit of preserving 32 steps tuning is
> that it can save time in certain scenarios.
> 
> On some platforms, when the delay selected through 64 steps tuning is
> very close to that chosen through 32 steps, we can reduce the tuning
> step from 64 to 32. This can save time sending the tuning block
> commands.
> 
> Thus using 32 steps tuning can save kernel boot up time.
> 
> Another case where time can be saved is when accessing the RPMB
> partition of eMMC. Each time switch to RPMB partition, there is a
> retune action, causing a certain drop in performance. If we are certain
> that the results of 32 steps tuning are usable and we use it, this can
> in a sense also guarantee performance when accessing the RPMB
> partition.
> 

Thanks for this explanation! Though, I have some more questions...

...regarding boot up time, how much time are we talking about?

I'm asking because while now I see - and agree - on using 32-steps tuning
on eMMC to guarantee performance during RPMB access, as far as I know,
there is no RPMB partition on SD/MicroSD cards (and, of course, SDIO devices).

If the boot performance impact isn't big, as in, up to ~100 milliseconds is
not big at all (especially with async probe!), we can definitely avoid the
addition of a devicetree property for 32-steps tuning, hence use a dynamic
selection strategy such that:
  - On eMMC devices, always perform 32-steps tuning (hence no boot delay)
  - On SD cards and SDIO, always perform 64-steps tuning

Cheers,
Angelo
Axe Yang (杨磊) Nov. 29, 2023, 3:16 a.m. UTC | #4
On Tue, 2023-11-28 at 11:20 +0100, AngeloGioacchino Del Regno wrote:
> Il 28/11/23 10:38, Axe Yang (杨磊) ha scritto:
> > > On Tue, 2023-11-28 at 09:53 +0100, AngeloGioacchino Del Regno
> > > wrote:
> > > Il 28/11/23 08:01, Axe Yang ha scritto:
> > > > Previously, during the MSDC calibration process, a full clock
> > > > cycle
> > > > actually not be covered, which in some cases didn't yield the
> > > > best
> > > > results and could cause CRC errors. This problem is
> > > > particularly
> > > > evident when MSDC is used as an SDIO host. In fact, MSDC
> > > > support
> > > > tuning up to a maximum of 64 steps, but by default, the step
> > > > number
> > > > is 32. By increase the tuning step, we are more likely to cover
> > > > more
> > > > parts of a clock cycle, and get better calibration result.
> > > > 
> > > > To illustrate, when tuning 32 steps, if the obtained window has
> > > > a
> > > > hole
> > > > near the middle, like this: 0xffc07ff (hex), then the selected
> > > > delay
> > > > will be the 6 (counting from right to left).
> > > > 
> > > > (32 <- 1)
> > > > 1111 1111 1100 0000 0000 0111 11(1)1 1111
> > > > 
> > > > However, if we tune 64 steps, the window obtained may look like
> > > > this:
> > > > 0xfffffffffffc07ff. The final selected delay will be 44, which
> > > > is
> > > > safer as it is further away from the hole:
> > > > 
> > > > (64 <- 1)
> > > > 1111 ... (1)111 1111 1111 1111 1111 1100 0000 0000 0111 1111
> > > > 1111
> > > > 
> > > > In this case, delay 6 selected through 32 steps tuning is
> > > > obviously
> > > > not optimal, and this delay is closer to the hole, using it
> > > > would
> > > > easily cause CRC problems.
> > > > 
> > > > You will need to configure property "mediatek,tuning-step" in
> > > > MSDC
> > > > dts node to 64 to extend the steps.
> > > > 
> > > 
> > > If we can run 64 tuning steps, why should we run 32?
> > > 
> > > Why isn't it just better to *always* run 64 tuning steps, on SoCs
> > > supporting that?
> > > 
> > > Thanks,
> > > Angelo
> > 
> > Hi Angelo,
> > 
> > That is a good question. The benefit of preserving 32 steps tuning
> > is
> > that it can save time in certain scenarios.
> > 
> > On some platforms, when the delay selected through 64 steps tuning
> > is
> > very close to that chosen through 32 steps, we can reduce the
> > tuning
> > step from 64 to 32. This can save time sending the tuning block
> > commands.
> > 
> > Thus using 32 steps tuning can save kernel boot up time.
> > 
> > Another case where time can be saved is when accessing the RPMB
> > partition of eMMC. Each time switch to RPMB partition, there is a
> > retune action, causing a certain drop in performance. If we are
> > certain
> > that the results of 32 steps tuning are usable and we use it, this
> > can
> > in a sense also guarantee performance when accessing the RPMB
> > partition.
> > 
> 
> Thanks for this explanation! Though, I have some more questions...
> 
> ...regarding boot up time, how much time are we talking about?

Luckily, I have a platform at hand that can be used for experiments/
Below are the results from testing on this platform:

[    2.431993][T1200180] kworker/2:21: mtk-msdc bootdevice:
[name:mtk_sd&]Start tuning
[    2.434950][T1200180] kworker/2:21: mtk-msdc bootdevice:
[name:mtk_sd&]Tuning finished
[    2.435957][T1200180] kworker/2:21: mtk-msdc bootdevice:
[name:mtk_sd&]phase: [map:00000000ffffffc0] [maxlen:26] [final:19]
[    2.462375][T1200180] kworker/2:21: [name:mmc_core&]mmc0: new HS400
MMC card at address 0001
...
[    2.519863][T1300069] kworker/3:1: mtk-msdc 11250000.mmc:
[name:mtk_sd&]Start tuning
[    2.526271][T1300069] kworker/3:1: mtk-msdc 11250000.mmc:
[name:mtk_sd&]Tuning finished
[    2.527288][T1300069] kworker/3:1: mtk-msdc 11250000.mmc:
[name:mtk_sd&]phase: [map:ffffffffffff003f] [maxlen:48] [final:40]
[    2.532269][T1300069] kworker/3:1: [name:mmc_core&]mmc2: new ultra
high speed SDR104 SDIO card at address 0001

As the kernel log indicates, it took 3 ms for eMMC to tune 32 steps,
while it took about 7 ms for SDIO to tune 64 steps. I have to admit,
when it comes to saving boot up time, the benefits of reducing step
form 64 to 32 are quite minimal. Just as you said, especially when
async probe is enabled.

> 
> I'm asking because while now I see - and agree - on using 32-steps
> tuning
> on eMMC to guarantee performance during RPMB access, as far as I
> know,
> there is no RPMB partition on SD/MicroSD cards (and, of course, SDIO
> devices).
> 
> If the boot performance impact isn't big, as in, up to ~100
> milliseconds is
> not big at all (especially with async probe!), we can definitely
> avoid the
> addition of a devicetree property for 32-steps tuning, hence use a
> dynamic
> selection strategy such that:
>   - On eMMC devices, always perform 32-steps tuning (hence no boot
> delay)
>   - On SD cards and SDIO, always perform 64-steps tuning

eMMC could also potentially have CRC issue if only tune 32-steps,
albeit with a lower likelihood. The precondition for using 32-steps
tuning is that it could provide roughly the same valid results as using
64-steps tuning. So taking everything into account, controlling the
tuning step as needed through the use of dts property seems to be a
more flixible approach.

Regards,
Axe

>
AngeloGioacchino Del Regno Dec. 1, 2023, 8:53 a.m. UTC | #5
Il 29/11/23 04:16, Axe Yang (杨磊) ha scritto:
> On Tue, 2023-11-28 at 11:20 +0100, AngeloGioacchino Del Regno wrote:
>> Il 28/11/23 10:38, Axe Yang (杨磊) ha scritto:
>>>> On Tue, 2023-11-28 at 09:53 +0100, AngeloGioacchino Del Regno
>>>> wrote:
>>>> Il 28/11/23 08:01, Axe Yang ha scritto:
>>>>> Previously, during the MSDC calibration process, a full clock
>>>>> cycle
>>>>> actually not be covered, which in some cases didn't yield the
>>>>> best
>>>>> results and could cause CRC errors. This problem is
>>>>> particularly
>>>>> evident when MSDC is used as an SDIO host. In fact, MSDC
>>>>> support
>>>>> tuning up to a maximum of 64 steps, but by default, the step
>>>>> number
>>>>> is 32. By increase the tuning step, we are more likely to cover
>>>>> more
>>>>> parts of a clock cycle, and get better calibration result.
>>>>>
>>>>> To illustrate, when tuning 32 steps, if the obtained window has
>>>>> a
>>>>> hole
>>>>> near the middle, like this: 0xffc07ff (hex), then the selected
>>>>> delay
>>>>> will be the 6 (counting from right to left).
>>>>>
>>>>> (32 <- 1)
>>>>> 1111 1111 1100 0000 0000 0111 11(1)1 1111
>>>>>
>>>>> However, if we tune 64 steps, the window obtained may look like
>>>>> this:
>>>>> 0xfffffffffffc07ff. The final selected delay will be 44, which
>>>>> is
>>>>> safer as it is further away from the hole:
>>>>>
>>>>> (64 <- 1)
>>>>> 1111 ... (1)111 1111 1111 1111 1111 1100 0000 0000 0111 1111
>>>>> 1111
>>>>>
>>>>> In this case, delay 6 selected through 32 steps tuning is
>>>>> obviously
>>>>> not optimal, and this delay is closer to the hole, using it
>>>>> would
>>>>> easily cause CRC problems.
>>>>>
>>>>> You will need to configure property "mediatek,tuning-step" in
>>>>> MSDC
>>>>> dts node to 64 to extend the steps.
>>>>>
>>>>
>>>> If we can run 64 tuning steps, why should we run 32?
>>>>
>>>> Why isn't it just better to *always* run 64 tuning steps, on SoCs
>>>> supporting that?
>>>>
>>>> Thanks,
>>>> Angelo
>>>
>>> Hi Angelo,
>>>
>>> That is a good question. The benefit of preserving 32 steps tuning
>>> is
>>> that it can save time in certain scenarios.
>>>
>>> On some platforms, when the delay selected through 64 steps tuning
>>> is
>>> very close to that chosen through 32 steps, we can reduce the
>>> tuning
>>> step from 64 to 32. This can save time sending the tuning block
>>> commands.
>>>
>>> Thus using 32 steps tuning can save kernel boot up time.
>>>
>>> Another case where time can be saved is when accessing the RPMB
>>> partition of eMMC. Each time switch to RPMB partition, there is a
>>> retune action, causing a certain drop in performance. If we are
>>> certain
>>> that the results of 32 steps tuning are usable and we use it, this
>>> can
>>> in a sense also guarantee performance when accessing the RPMB
>>> partition.
>>>
>>
>> Thanks for this explanation! Though, I have some more questions...
>>
>> ...regarding boot up time, how much time are we talking about?
> 
> Luckily, I have a platform at hand that can be used for experiments/
> Below are the results from testing on this platform:
> 
> [    2.431993][T1200180] kworker/2:21: mtk-msdc bootdevice:
> [name:mtk_sd&]Start tuning
> [    2.434950][T1200180] kworker/2:21: mtk-msdc bootdevice:
> [name:mtk_sd&]Tuning finished
> [    2.435957][T1200180] kworker/2:21: mtk-msdc bootdevice:
> [name:mtk_sd&]phase: [map:00000000ffffffc0] [maxlen:26] [final:19]
> [    2.462375][T1200180] kworker/2:21: [name:mmc_core&]mmc0: new HS400
> MMC card at address 0001
> ...
> [    2.519863][T1300069] kworker/3:1: mtk-msdc 11250000.mmc:
> [name:mtk_sd&]Start tuning
> [    2.526271][T1300069] kworker/3:1: mtk-msdc 11250000.mmc:
> [name:mtk_sd&]Tuning finished
> [    2.527288][T1300069] kworker/3:1: mtk-msdc 11250000.mmc:
> [name:mtk_sd&]phase: [map:ffffffffffff003f] [maxlen:48] [final:40]
> [    2.532269][T1300069] kworker/3:1: [name:mmc_core&]mmc2: new ultra
> high speed SDR104 SDIO card at address 0001
> 
> As the kernel log indicates, it took 3 ms for eMMC to tune 32 steps,
> while it took about 7 ms for SDIO to tune 64 steps. I have to admit,
> when it comes to saving boot up time, the benefits of reducing step
> form 64 to 32 are quite minimal. Just as you said, especially when
> async probe is enabled.
> 

That's great to know, and it's *truly* nice information that you can
put into the commit message, as this completes the analysis of this
commit.

Can you mention that in the commit message for v4 of this commit?

"As per measurements taken on MT(xxxx), the tuning phase will take:
  eMMC - 32 steps: ~3ms
         64 steps: xxxx
  SDIO - 32 steps: xxxx
         64 steps: ~7ms

...but while this won't prolong boot times by any meaningful amount
of time, for eMMC, it should still be preferred to use 32 steps tuning
because otherwise we lose performance for RPMB I//O, which requires
re-tuning for each access.
"

>>
>> I'm asking because while now I see - and agree - on using 32-steps
>> tuning
>> on eMMC to guarantee performance during RPMB access, as far as I
>> know,
>> there is no RPMB partition on SD/MicroSD cards (and, of course, SDIO
>> devices).
>>
>> If the boot performance impact isn't big, as in, up to ~100
>> milliseconds is
>> not big at all (especially with async probe!), we can definitely
>> avoid the
>> addition of a devicetree property for 32-steps tuning, hence use a
>> dynamic
>> selection strategy such that:
>>    - On eMMC devices, always perform 32-steps tuning (hence no boot
>> delay)
>>    - On SD cards and SDIO, always perform 64-steps tuning
> 
> eMMC could also potentially have CRC issue if only tune 32-steps,
> albeit with a lower likelihood. The precondition for using 32-steps
> tuning is that it could provide roughly the same valid results as using
> 64-steps tuning. So taking everything into account, controlling the
> tuning step as needed through the use of dts property seems to be a
> more flixible approach.
> 

Yes but since the only performance concern is about eMMC RPMB access,
we could at least make this 64 steps as *default* for SD/SDIO, and
32 steps as default for eMMC.
Device tree would be an override of those default values.

Can we set 64 as default for SD/SDIO, 32 as default for eMMC, and then use
the device tree to override those defaults?

Cheers,
Angelo
Axe Yang (杨磊) Dec. 5, 2023, 1:23 a.m. UTC | #6
On Fri, 2023-12-01 at 09:53 +0100, AngeloGioacchino Del Regno wrote:
> Il 29/11/23 04:16, Axe Yang (杨磊) ha scritto:
> > On Tue, 2023-11-28 at 11:20 +0100, AngeloGioacchino Del Regno
> > wrote:
> > > Il 28/11/23 10:38, Axe Yang (杨磊) ha scritto:
> > > > > On Tue, 2023-11-28 at 09:53 +0100, AngeloGioacchino Del Regno
> > > > > wrote:
> > > > > Il 28/11/23 08:01, Axe Yang ha scritto:
> > > > > > Previously, during the MSDC calibration process, a full
> > > > > > clock
> > > > > > cycle
> > > > > > actually not be covered, which in some cases didn't yield
> > > > > > the
> > > > > > best
> > > > > > results and could cause CRC errors. This problem is
> > > > > > particularly
> > > > > > evident when MSDC is used as an SDIO host. In fact, MSDC
> > > > > > support
> > > > > > tuning up to a maximum of 64 steps, but by default, the
> > > > > > step
> > > > > > number
> > > > > > is 32. By increase the tuning step, we are more likely to
> > > > > > cover
> > > > > > more
> > > > > > parts of a clock cycle, and get better calibration result.
> > > > > > 
> > > > > > To illustrate, when tuning 32 steps, if the obtained window
> > > > > > has
> > > > > > a
> > > > > > hole
> > > > > > near the middle, like this: 0xffc07ff (hex), then the
> > > > > > selected
> > > > > > delay
> > > > > > will be the 6 (counting from right to left).
> > > > > > 
> > > > > > (32 <- 1)
> > > > > > 1111 1111 1100 0000 0000 0111 11(1)1 1111
> > > > > > 
> > > > > > However, if we tune 64 steps, the window obtained may look
> > > > > > like
> > > > > > this:
> > > > > > 0xfffffffffffc07ff. The final selected delay will be 44,
> > > > > > which
> > > > > > is
> > > > > > safer as it is further away from the hole:
> > > > > > 
> > > > > > (64 <- 1)
> > > > > > 1111 ... (1)111 1111 1111 1111 1111 1100 0000 0000 0111
> > > > > > 1111
> > > > > > 1111
> > > > > > 
> > > > > > In this case, delay 6 selected through 32 steps tuning is
> > > > > > obviously
> > > > > > not optimal, and this delay is closer to the hole, using it
> > > > > > would
> > > > > > easily cause CRC problems.
> > > > > > 
> > > > > > You will need to configure property "mediatek,tuning-step"
> > > > > > in
> > > > > > MSDC
> > > > > > dts node to 64 to extend the steps.
> > > > > > 
> > > > > 
> > > > > If we can run 64 tuning steps, why should we run 32?
> > > > > 
> > > > > Why isn't it just better to *always* run 64 tuning steps, on
> > > > > SoCs
> > > > > supporting that?
> > > > > 
> > > > > Thanks,
> > > > > Angelo
> > > > 
> > > > Hi Angelo,
> > > > 
> > > > That is a good question. The benefit of preserving 32 steps
> > > > tuning
> > > > is
> > > > that it can save time in certain scenarios.
> > > > 
> > > > On some platforms, when the delay selected through 64 steps
> > > > tuning
> > > > is
> > > > very close to that chosen through 32 steps, we can reduce the
> > > > tuning
> > > > step from 64 to 32. This can save time sending the tuning block
> > > > commands.
> > > > 
> > > > Thus using 32 steps tuning can save kernel boot up time.
> > > > 
> > > > Another case where time can be saved is when accessing the RPMB
> > > > partition of eMMC. Each time switch to RPMB partition, there is
> > > > a
> > > > retune action, causing a certain drop in performance. If we are
> > > > certain
> > > > that the results of 32 steps tuning are usable and we use it,
> > > > this
> > > > can
> > > > in a sense also guarantee performance when accessing the RPMB
> > > > partition.
> > > > 
> > > 
> > > Thanks for this explanation! Though, I have some more
> > > questions...
> > > 
> > > ...regarding boot up time, how much time are we talking about?
> > 
> > Luckily, I have a platform at hand that can be used for
> > experiments/
> > Below are the results from testing on this platform:
> > 
> > [    2.431993][T1200180] kworker/2:21: mtk-msdc bootdevice:
> > [name:mtk_sd&]Start tuning
> > [    2.434950][T1200180] kworker/2:21: mtk-msdc bootdevice:
> > [name:mtk_sd&]Tuning finished
> > [    2.435957][T1200180] kworker/2:21: mtk-msdc bootdevice:
> > [name:mtk_sd&]phase: [map:00000000ffffffc0] [maxlen:26] [final:19]
> > [    2.462375][T1200180] kworker/2:21: [name:mmc_core&]mmc0: new
> > HS400
> > MMC card at address 0001
> > ...
> > [    2.519863][T1300069] kworker/3:1: mtk-msdc 11250000.mmc:
> > [name:mtk_sd&]Start tuning
> > [    2.526271][T1300069] kworker/3:1: mtk-msdc 11250000.mmc:
> > [name:mtk_sd&]Tuning finished
> > [    2.527288][T1300069] kworker/3:1: mtk-msdc 11250000.mmc:
> > [name:mtk_sd&]phase: [map:ffffffffffff003f] [maxlen:48] [final:40]
> > [    2.532269][T1300069] kworker/3:1: [name:mmc_core&]mmc2: new
> > ultra
> > high speed SDR104 SDIO card at address 0001
> > 
> > As the kernel log indicates, it took 3 ms for eMMC to tune 32
> > steps,
> > while it took about 7 ms for SDIO to tune 64 steps. I have to
> > admit,
> > when it comes to saving boot up time, the benefits of reducing step
> > form 64 to 32 are quite minimal. Just as you said, especially when
> > async probe is enabled.
> > 
> 
> That's great to know, and it's *truly* nice information that you can
> put into the commit message, as this completes the analysis of this
> commit.
> 
> Can you mention that in the commit message for v4 of this commit?
> 
> "As per measurements taken on MT(xxxx), the tuning phase will take:
>   eMMC - 32 steps: ~3ms
>          64 steps: xxxx
>   SDIO - 32 steps: xxxx
>          64 steps: ~7ms
> 
Okay, will update the commit message in v4.


> ...but while this won't prolong boot times by any meaningful amount
> of time, for eMMC, it should still be preferred to use 32 steps
> tuning
> because otherwise we lose performance for RPMB I//O, which requires
> re-tuning for each access.
> "
OK, I will make SD/SDIO default to use 64 steps tuning .


> > > 
> > > I'm asking because while now I see - and agree - on using 32-
> > > steps
> > > tuning
> > > on eMMC to guarantee performance during RPMB access, as far as I
> > > know,
> > > there is no RPMB partition on SD/MicroSD cards (and, of course,
> > > SDIO
> > > devices).
> > > 
> > > If the boot performance impact isn't big, as in, up to ~100
> > > milliseconds is
> > > not big at all (especially with async probe!), we can definitely
> > > avoid the
> > > addition of a devicetree property for 32-steps tuning, hence use
> > > a
> > > dynamic
> > > selection strategy such that:
> > >    - On eMMC devices, always perform 32-steps tuning (hence no
> > > boot
> > > delay)
> > >    - On SD cards and SDIO, always perform 64-steps tuning
> > 
> > eMMC could also potentially have CRC issue if only tune 32-steps,
> > albeit with a lower likelihood. The precondition for using 32-steps
> > tuning is that it could provide roughly the same valid results as
> > using
> > 64-steps tuning. So taking everything into account, controlling the
> > tuning step as needed through the use of dts property seems to be a
> > more flixible approach.
> > 
> 
> Yes but since the only performance concern is about eMMC RPMB access,
> we could at least make this 64 steps as *default* for SD/SDIO, and
> 32 steps as default for eMMC.
> Device tree would be an override of those default values.
> 
> Can we set 64 as default for SD/SDIO, 32 as default for eMMC, and
> then use
> the device tree to override those defaults?
> 
Will update in v4.

> Angelo
diff mbox series

Patch

diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c
index 97f7c3d4be6e..c8297f501a1e 100644
--- a/drivers/mmc/host/mtk-sd.c
+++ b/drivers/mmc/host/mtk-sd.c
@@ -252,12 +252,16 @@ 
 
 #define MSDC_PAD_TUNE_DATWRDLY	  GENMASK(4, 0)		/* RW */
 #define MSDC_PAD_TUNE_DATRRDLY	  GENMASK(12, 8)	/* RW */
+#define MSDC_PAD_TUNE_DATRRDLY2	  GENMASK(12, 8)	/* RW */
 #define MSDC_PAD_TUNE_CMDRDLY	  GENMASK(20, 16)	/* RW */
+#define MSDC_PAD_TUNE_CMDRDLY2	  GENMASK(20, 16)	/* RW */
 #define MSDC_PAD_TUNE_CMDRRDLY	  GENMASK(26, 22)	/* RW */
 #define MSDC_PAD_TUNE_CLKTDLY	  GENMASK(31, 27)	/* RW */
 #define MSDC_PAD_TUNE_RXDLYSEL	  BIT(15)   /* RW */
 #define MSDC_PAD_TUNE_RD_SEL	  BIT(13)   /* RW */
 #define MSDC_PAD_TUNE_CMD_SEL	  BIT(21)   /* RW */
+#define MSDC_PAD_TUNE_RD2_SEL	  BIT(13)   /* RW */
+#define MSDC_PAD_TUNE_CMD2_SEL	  BIT(21)   /* RW */
 
 #define PAD_DS_TUNE_DLY_SEL       BIT(0)	  /* RW */
 #define PAD_DS_TUNE_DLY1	  GENMASK(6, 2)   /* RW */
@@ -325,7 +329,8 @@ 
 
 #define DEFAULT_DEBOUNCE	(8)	/* 8 cycles CD debounce */
 
-#define PAD_DELAY_MAX	32 /* PAD delay cells */
+#define PAD_DELAY_HALF	32 /* PAD delay cells */
+#define PAD_DELAY_FULL	64
 /*--------------------------------------------------------------------------*/
 /* Descriptor Structure                                                     */
 /*--------------------------------------------------------------------------*/
@@ -461,6 +466,7 @@  struct msdc_host {
 	u32 hs400_ds_dly3;
 	u32 hs200_cmd_int_delay; /* cmd internal delay for HS200/SDR104 */
 	u32 hs400_cmd_int_delay; /* cmd internal delay for HS400 */
+	u32 tuning_step;
 	bool hs400_cmd_resp_sel_rising;
 				 /* cmd response sample selection for HS400 */
 	bool hs400_mode;	/* current eMMC will run at hs400 mode */
@@ -1615,7 +1621,7 @@  static irqreturn_t msdc_cmdq_irq(struct msdc_host *host, u32 intsts)
 	}
 
 	if (cmd_err || dat_err) {
-		dev_err(host->dev, "cmd_err = %d, dat_err =%d, intsts = 0x%x",
+		dev_err(host->dev, "cmd_err = %d, dat_err = %d, intsts = 0x%x",
 			cmd_err, dat_err, intsts);
 	}
 
@@ -1780,10 +1786,20 @@  static void msdc_init_hw(struct msdc_host *host)
 				     DATA_K_VALUE_SEL);
 			sdr_set_bits(host->top_base + EMMC_TOP_CMD,
 				     PAD_CMD_RD_RXDLY_SEL);
+			if (host->tuning_step > PAD_DELAY_HALF) {
+				sdr_set_bits(host->top_base + EMMC_TOP_CONTROL,
+					     PAD_DAT_RD_RXDLY2_SEL);
+				sdr_set_bits(host->top_base + EMMC_TOP_CMD,
+					     PAD_CMD_RD_RXDLY2_SEL);
+			}
 		} else {
 			sdr_set_bits(host->base + tune_reg,
 				     MSDC_PAD_TUNE_RD_SEL |
 				     MSDC_PAD_TUNE_CMD_SEL);
+			if (host->tuning_step > PAD_DELAY_HALF)
+				sdr_set_bits(host->base + tune_reg + 4,
+					     MSDC_PAD_TUNE_RD2_SEL |
+					     MSDC_PAD_TUNE_CMD2_SEL);
 		}
 	} else {
 		/* choose clock tune */
@@ -1925,24 +1941,24 @@  static void msdc_ops_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 		msdc_set_mclk(host, ios->timing, ios->clock);
 }
 
-static u32 test_delay_bit(u32 delay, u32 bit)
+static u64 test_delay_bit(u64 delay, u32 bit)
 {
-	bit %= PAD_DELAY_MAX;
+	bit %= PAD_DELAY_FULL;
 	return delay & BIT(bit);
 }
 
-static int get_delay_len(u32 delay, u32 start_bit)
+static int get_delay_len(u64 delay, u32 start_bit)
 {
 	int i;
 
-	for (i = 0; i < (PAD_DELAY_MAX - start_bit); i++) {
+	for (i = 0; i < (PAD_DELAY_FULL - start_bit); i++) {
 		if (test_delay_bit(delay, start_bit + i) == 0)
 			return i;
 	}
-	return PAD_DELAY_MAX - start_bit;
+	return PAD_DELAY_FULL - start_bit;
 }
 
-static struct msdc_delay_phase get_best_delay(struct msdc_host *host, u32 delay)
+static struct msdc_delay_phase get_best_delay(struct msdc_host *host, u64 delay)
 {
 	int start = 0, len = 0;
 	int start_final = 0, len_final = 0;
@@ -1950,28 +1966,28 @@  static struct msdc_delay_phase get_best_delay(struct msdc_host *host, u32 delay)
 	struct msdc_delay_phase delay_phase = { 0, };
 
 	if (delay == 0) {
-		dev_err(host->dev, "phase error: [map:%x]\n", delay);
+		dev_err(host->dev, "phase error: [map:%llx]\n", delay);
 		delay_phase.final_phase = final_phase;
 		return delay_phase;
 	}
 
-	while (start < PAD_DELAY_MAX) {
+	while (start < PAD_DELAY_FULL) {
 		len = get_delay_len(delay, start);
 		if (len_final < len) {
 			start_final = start;
 			len_final = len;
 		}
 		start += len ? len : 1;
-		if (len >= 12 && start_final < 4)
+		if (!upper_32_bits(delay) && len >= 12 && start_final < 4)
 			break;
 	}
 
 	/* The rule is that to find the smallest delay cell */
 	if (start_final == 0)
-		final_phase = (start_final + len_final / 3) % PAD_DELAY_MAX;
+		final_phase = (start_final + len_final / 3) % PAD_DELAY_FULL;
 	else
-		final_phase = (start_final + len_final / 2) % PAD_DELAY_MAX;
-	dev_dbg(host->dev, "phase: [map:%x] [maxlen:%d] [final:%d]\n",
+		final_phase = (start_final + len_final / 2) % PAD_DELAY_FULL;
+	dev_dbg(host->dev, "phase: [map:%llx] [maxlen:%d] [final:%d]\n",
 		delay, len_final, final_phase);
 
 	delay_phase.maxlen = len_final;
@@ -1984,30 +2000,68 @@  static inline void msdc_set_cmd_delay(struct msdc_host *host, u32 value)
 {
 	u32 tune_reg = host->dev_comp->pad_tune_reg;
 
-	if (host->top_base)
-		sdr_set_field(host->top_base + EMMC_TOP_CMD, PAD_CMD_RXDLY,
-			      value);
-	else
-		sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_CMDRDLY,
-			      value);
+	if (host->top_base) {
+		if (value < PAD_DELAY_HALF) {
+			sdr_set_field(host->top_base + EMMC_TOP_CMD, PAD_CMD_RXDLY,
+				      value);
+			sdr_set_field(host->top_base + EMMC_TOP_CMD, PAD_CMD_RXDLY2,
+				      0);
+		} else {
+			sdr_set_field(host->top_base + EMMC_TOP_CMD, PAD_CMD_RXDLY,
+				      PAD_DELAY_HALF - 1);
+			sdr_set_field(host->top_base + EMMC_TOP_CMD, PAD_CMD_RXDLY2,
+				      value - PAD_DELAY_HALF);
+		}
+	} else {
+		if (value < PAD_DELAY_HALF) {
+			sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_CMDRDLY,
+				      value);
+			sdr_set_field(host->base + tune_reg + 4, MSDC_PAD_TUNE_CMDRDLY2,
+				      0);
+		} else {
+			sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_CMDRDLY,
+				      PAD_DELAY_HALF - 1);
+			sdr_set_field(host->base + tune_reg + 4, MSDC_PAD_TUNE_CMDRDLY2,
+				      value - PAD_DELAY_HALF);
+		}
+	}
 }
 
 static inline void msdc_set_data_delay(struct msdc_host *host, u32 value)
 {
 	u32 tune_reg = host->dev_comp->pad_tune_reg;
 
-	if (host->top_base)
-		sdr_set_field(host->top_base + EMMC_TOP_CONTROL,
-			      PAD_DAT_RD_RXDLY, value);
-	else
-		sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_DATRRDLY,
-			      value);
+	if (host->top_base) {
+		if (value < PAD_DELAY_HALF) {
+			sdr_set_field(host->top_base + EMMC_TOP_CONTROL,
+				      PAD_DAT_RD_RXDLY, value);
+			sdr_set_field(host->top_base + EMMC_TOP_CONTROL,
+				      PAD_DAT_RD_RXDLY2, 0);
+		} else {
+			sdr_set_field(host->top_base + EMMC_TOP_CONTROL,
+				      PAD_DAT_RD_RXDLY, PAD_DELAY_HALF - 1);
+			sdr_set_field(host->top_base + EMMC_TOP_CONTROL,
+				      PAD_DAT_RD_RXDLY2, value - PAD_DELAY_HALF);
+		}
+	} else {
+		if (value < PAD_DELAY_HALF) {
+			sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_DATRRDLY,
+				      value);
+			sdr_set_field(host->base + tune_reg + 4, MSDC_PAD_TUNE_DATRRDLY2,
+				      0);
+		} else {
+			sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_DATRRDLY,
+				      PAD_DELAY_HALF - 1);
+			sdr_set_field(host->base + tune_reg + 4, MSDC_PAD_TUNE_DATRRDLY2,
+				      value - PAD_DELAY_HALF);
+		}
+	}
 }
 
 static int msdc_tune_response(struct mmc_host *mmc, u32 opcode)
 {
 	struct msdc_host *host = mmc_priv(mmc);
-	u32 rise_delay = 0, fall_delay = 0;
+	u64 rise_delay = 0, fall_delay = 0;
 	struct msdc_delay_phase final_rise_delay, final_fall_delay = { 0,};
 	struct msdc_delay_phase internal_delay_phase;
 	u8 final_delay, final_maxlen;
@@ -2023,7 +2077,7 @@  static int msdc_tune_response(struct mmc_host *mmc, u32 opcode)
 			      host->hs200_cmd_int_delay);
 
 	sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
-	for (i = 0 ; i < PAD_DELAY_MAX; i++) {
+	for (i = 0; i < host->tuning_step; i++) {
 		msdc_set_cmd_delay(host, i);
 		/*
 		 * Using the same parameters, it may sometimes pass the test,
@@ -2047,7 +2101,7 @@  static int msdc_tune_response(struct mmc_host *mmc, u32 opcode)
 		goto skip_fall;
 
 	sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
-	for (i = 0; i < PAD_DELAY_MAX; i++) {
+	for (i = 0; i < host->tuning_step; i++) {
 		msdc_set_cmd_delay(host, i);
 		/*
 		 * Using the same parameters, it may sometimes pass the test,
@@ -2082,7 +2136,7 @@  static int msdc_tune_response(struct mmc_host *mmc, u32 opcode)
 	if (host->dev_comp->async_fifo || host->hs200_cmd_int_delay)
 		goto skip_internal;
 
-	for (i = 0; i < PAD_DELAY_MAX; i++) {
+	for (i = 0; i < host->tuning_step; i++) {
 		sdr_set_field(host->base + tune_reg,
 			      MSDC_PAD_TUNE_CMDRRDLY, i);
 		mmc_send_tuning(mmc, opcode, &cmd_err);
@@ -2121,7 +2175,8 @@  static int hs400_tune_response(struct mmc_host *mmc, u32 opcode)
 		sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
 	else
 		sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
-	for (i = 0 ; i < PAD_DELAY_MAX; i++) {
+
+	for (i = 0; i < PAD_DELAY_HALF; i++) {
 		sdr_set_field(host->base + PAD_CMD_TUNE,
 			      PAD_CMD_TUNE_RX_DLY3, i);
 		/*
@@ -2151,7 +2206,7 @@  static int hs400_tune_response(struct mmc_host *mmc, u32 opcode)
 static int msdc_tune_data(struct mmc_host *mmc, u32 opcode)
 {
 	struct msdc_host *host = mmc_priv(mmc);
-	u32 rise_delay = 0, fall_delay = 0;
+	u64 rise_delay = 0, fall_delay = 0;
 	struct msdc_delay_phase final_rise_delay, final_fall_delay = { 0,};
 	u8 final_delay, final_maxlen;
 	int i, ret;
@@ -2160,7 +2215,7 @@  static int msdc_tune_data(struct mmc_host *mmc, u32 opcode)
 		      host->latch_ck);
 	sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_DSPL);
 	sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_W_DSPL);
-	for (i = 0 ; i < PAD_DELAY_MAX; i++) {
+	for (i = 0; i < host->tuning_step; i++) {
 		msdc_set_data_delay(host, i);
 		ret = mmc_send_tuning(mmc, opcode, NULL);
 		if (!ret)
@@ -2174,7 +2229,7 @@  static int msdc_tune_data(struct mmc_host *mmc, u32 opcode)
 
 	sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_DSPL);
 	sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_W_DSPL);
-	for (i = 0; i < PAD_DELAY_MAX; i++) {
+	for (i = 0; i < host->tuning_step; i++) {
 		msdc_set_data_delay(host, i);
 		ret = mmc_send_tuning(mmc, opcode, NULL);
 		if (!ret)
@@ -2206,7 +2261,7 @@  static int msdc_tune_data(struct mmc_host *mmc, u32 opcode)
 static int msdc_tune_together(struct mmc_host *mmc, u32 opcode)
 {
 	struct msdc_host *host = mmc_priv(mmc);
-	u32 rise_delay = 0, fall_delay = 0;
+	u64 rise_delay = 0, fall_delay = 0;
 	struct msdc_delay_phase final_rise_delay, final_fall_delay = { 0,};
 	u8 final_delay, final_maxlen;
 	int i, ret;
@@ -2217,7 +2272,7 @@  static int msdc_tune_together(struct mmc_host *mmc, u32 opcode)
 	sdr_clr_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
 	sdr_clr_bits(host->base + MSDC_IOCON,
 		     MSDC_IOCON_DSPL | MSDC_IOCON_W_DSPL);
-	for (i = 0 ; i < PAD_DELAY_MAX; i++) {
+	for (i = 0; i < host->tuning_step; i++) {
 		msdc_set_cmd_delay(host, i);
 		msdc_set_data_delay(host, i);
 		ret = mmc_send_tuning(mmc, opcode, NULL);
@@ -2233,7 +2288,7 @@  static int msdc_tune_together(struct mmc_host *mmc, u32 opcode)
 	sdr_set_bits(host->base + MSDC_IOCON, MSDC_IOCON_RSPL);
 	sdr_set_bits(host->base + MSDC_IOCON,
 		     MSDC_IOCON_DSPL | MSDC_IOCON_W_DSPL);
-	for (i = 0; i < PAD_DELAY_MAX; i++) {
+	for (i = 0; i < host->tuning_step; i++) {
 		msdc_set_cmd_delay(host, i);
 		msdc_set_data_delay(host, i);
 		ret = mmc_send_tuning(mmc, opcode, NULL);
@@ -2346,7 +2401,7 @@  static int msdc_execute_hs400_tuning(struct mmc_host *mmc, struct mmc_card *card
 	}
 
 	host->hs400_tuning = true;
-	for (i = 0; i < PAD_DELAY_MAX; i++) {
+	for (i = 0; i < PAD_DELAY_HALF; i++) {
 		if (host->top_base)
 			sdr_set_field(host->top_base + EMMC50_PAD_DS_TUNE,
 				      PAD_DS_DLY1, i);
@@ -2601,6 +2656,10 @@  static void msdc_of_property_parse(struct platform_device *pdev,
 	else
 		host->hs400_cmd_resp_sel_rising = false;
 
+	if (of_property_read_u32(pdev->dev.of_node, "mediatek,tuning-step",
+				 &host->tuning_step))
+		host->tuning_step = PAD_DELAY_HALF;
+
 	if (of_property_read_bool(pdev->dev.of_node,
 				  "supports-cqe"))
 		host->cqhci = true;