diff mbox series

[V2] tty: serial: qcom-geni-serial: Fix get_clk_div_rate() which otherwise could return a sub-optimal clock rate.

Message ID 1656496841-5853-1-git-send-email-quic_vnivarth@quicinc.com (mailing list archive)
State Superseded
Headers show
Series [V2] tty: serial: qcom-geni-serial: Fix get_clk_div_rate() which otherwise could return a sub-optimal clock rate. | expand

Commit Message

Vijaya Krishna Nivarthi June 29, 2022, 10 a.m. UTC
In the logic around call to clk_round_rate(), for some corner conditions,
get_clk_div_rate() could return an sub-optimal clock rate. Also, if an
exact clock rate was not found lowest clock was being returned.

Search for suitable clock rate in 2 steps
a) exact match or within 2% tolerance
b) within 5% tolerance
This also takes care of corner conditions.

Reported-by: kernel test robot <lkp@intel.com>
Fixes: c2194bc999d4 ("tty: serial: qcom-geni-serial: Remove uart frequency table. Instead, find suitable frequency with call to clk_round_rate")
Signed-off-by: Vijaya Krishna Nivarthi <quic_vnivarth@quicinc.com>
---
v2: removed minor optimisations to make more readable
v1: intial patch contained slightly complicated logic
---
 drivers/tty/serial/qcom_geni_serial.c | 122 +++++++++++++++++++++++++---------
 1 file changed, 90 insertions(+), 32 deletions(-)

Comments

Doug Anderson June 29, 2022, 11:15 p.m. UTC | #1
Hi,

On Wed, Jun 29, 2022 at 3:01 AM Vijaya Krishna Nivarthi
<quic_vnivarth@quicinc.com> wrote:
>
> In the logic around call to clk_round_rate(), for some corner conditions,
> get_clk_div_rate() could return an sub-optimal clock rate. Also, if an
> exact clock rate was not found lowest clock was being returned.
>
> Search for suitable clock rate in 2 steps
> a) exact match or within 2% tolerance
> b) within 5% tolerance
> This also takes care of corner conditions.
>
> Reported-by: kernel test robot <lkp@intel.com>
> Fixes: c2194bc999d4 ("tty: serial: qcom-geni-serial: Remove uart frequency table. Instead, find suitable frequency with call to clk_round_rate")
> Signed-off-by: Vijaya Krishna Nivarthi <quic_vnivarth@quicinc.com>
> ---
> v2: removed minor optimisations to make more readable
> v1: intial patch contained slightly complicated logic
> ---
>  drivers/tty/serial/qcom_geni_serial.c | 122 +++++++++++++++++++++++++---------
>  1 file changed, 90 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c
> index 2e23b65..d0696d1 100644
> --- a/drivers/tty/serial/qcom_geni_serial.c
> +++ b/drivers/tty/serial/qcom_geni_serial.c
> @@ -943,52 +943,111 @@ static int qcom_geni_serial_startup(struct uart_port *uport)
>         return 0;
>  }
>
> -static unsigned long get_clk_div_rate(struct clk *clk, unsigned int baud,
> -                       unsigned int sampling_rate, unsigned int *clk_div)
> +static unsigned long find_clk_rate_in_tol(struct clk *clk, unsigned int desired_clk,
> +                       unsigned int *clk_div, unsigned int percent_tol, bool exact_match)
>  {
> +       unsigned long freq;
> +       unsigned long div, maxdiv, new_div;
> +       u64 mult;
>         unsigned long ser_clk;
> -       unsigned long desired_clk;
> -       unsigned long freq, prev;
> -       unsigned long div, maxdiv;
> -       int64_t mult;
> -
> -       desired_clk = baud * sampling_rate;
> -       if (!desired_clk) {
> -               pr_err("%s: Invalid frequency\n", __func__);
> -               return 0;
> -       }
> +       unsigned long test_freq, offset, new_freq;
>
> +       ser_clk = 0;
>         maxdiv = CLK_DIV_MSK >> CLK_DIV_SHFT;
> -       prev = 0;
> +       div = 1;
>
> -       for (div = 1; div <= maxdiv; div++) {
> -               mult = div * desired_clk;
> -               if (mult > ULONG_MAX)
> +       while (div <= maxdiv) {
> +               mult = (u64)div * desired_clk;
> +               if (mult != (unsigned long)mult)
>                         break;
>
> -               freq = clk_round_rate(clk, (unsigned long)mult);
> +               /*
> +                * Loop requesting a freq within tolerance and possibly exact freq.
> +                *
> +                * We'll keep track of the lowest freq inexact match we found
> +                * but always try to find a perfect match. NOTE: this algorithm
> +                * could miss a slightly better freq if there's more than one
> +                * freq between (freq - offset) and (freq) but (freq) can't be made
> +                * exactly, but that's OK.
> +                *
> +                * This absolutely relies on the fact that the Qualcomm clock
> +                * driver always rounds up.
> +                * We make use of exact_match as an I/O param.
> +                */
> +
> +               /* look only for exact match if within tolerance is already found */
> +               if (ser_clk)
> +                       offset = 0;
> +               else
> +                       offset = div_u64(mult * percent_tol, 100);
> +
> +               test_freq = mult - offset;
> +               freq = clk_round_rate(clk, test_freq);
> +
> +               /*
> +                * A dead-on freq is an insta-win
> +                */
>                 if (!(freq % desired_clk)) {
>                         ser_clk = freq;
> -                       break;
> +                       *clk_div = freq / desired_clk;
> +                       return ser_clk;
>                 }
>
> -               if (!prev)
> -                       ser_clk = freq;
> -               else if (prev == freq)
> -                       break;
> +               if (!ser_clk) {
> +                       new_div = DIV_ROUND_CLOSEST(freq, desired_clk);
> +                       new_freq = new_div * desired_clk;
> +                       offset = (new_freq * percent_tol) / 100;

Can't you overflow in the above calculation? If "percent_tol" is 5
then anything over ~859 MHz would overflow. I guess it's not likely,
but since you take so much care elsewhere... Mabye this should be:

offset = div_u64((u64)new_freq * percent_tol, 100)


> +
> +                       if (new_freq - offset <= freq && freq <= new_freq + offset) {

This whole algorithm is predicated on clk_round_rate() only ever
rounding up. ...so you don't need to check if the clock is too low,
only if the clock is too high. Well, at least after you move the
"break" condition below to right after the clk_round_rate().


> +                               /* Save the first (lowest freq) within tolerance */
> +                               ser_clk = freq;
> +                               *clk_div = new_div;
> +                               /* no more search for exact match required in 2nd run */
> +                               if (!exact_match)
> +                                       break;
> +                       }
> +               }
>
> -               prev = freq;
> +               div = freq / desired_clk + 1;

Can't you infinite loop now?

Start with:

desired_clk = 10000
div = 1
percent_tol = 2


Now:

mult = 10000
offset = 200
test_freq = 9800
freq = 9800
div = 9800 / 10000 + 1 = 0 + 1 = 1

...and then you'll loop again with "div = 1", won't you? ...or did I
get something wrong in my analysis? This is the reason my proposed
algorithm had two loops.


> +               /*
> +                * Only time clock framework doesn't round up is if
> +                * we're past the max clock rate. We're done searching
> +                * if that's the case.
> +                */
> +               if (freq < test_freq)
> +                       break;

Why did you move this test to the end? It should be right after the
clk_round_rate(). If clk_round_rate() ever returns something lower
than the clock you asked for (which is the minimum tolerance that
we'll accept) then we can just bail out right away.


>         }
>
> -       if (!ser_clk) {
> -               pr_err("%s: Can't find matching DFS entry for baud %d\n",
> -                                                               __func__, baud);
> -               return ser_clk;
> +       return ser_clk;
> +}
> +
> +static unsigned long get_clk_div_rate(struct clk *clk, unsigned int baud,
> +                       unsigned int sampling_rate, unsigned int *clk_div)
> +{
> +       unsigned long ser_clk;
> +       unsigned long desired_clk;
> +
> +       desired_clk = baud * sampling_rate;
> +       if (!desired_clk) {
> +               pr_err("%s: Invalid frequency\n", __func__);
> +               return 0;
>         }
>
> -       *clk_div = ser_clk / desired_clk;
> -       if (!(*clk_div))
> -               *clk_div = 1;
> +       ser_clk = 0;

Get rid of this init of ser_clk to 0. It doesn't do anything.


> +       /*
> +        * try to find exact clock rate or within 2% tolerance,
> +        * then within 5% tolerance
> +        */
> +       ser_clk = find_clk_rate_in_tol(clk, desired_clk, clk_div, 2, true);
> +       if (!ser_clk)
> +               ser_clk = find_clk_rate_in_tol(clk, desired_clk, clk_div, 5, false);
> +
> +       if (!ser_clk)
> +               pr_err("Couldn't find suitable clock rate for %d\n", desired_clk);
> +       else
> +               pr_debug("desired_clk-%d, ser_clk-%d, clk_div-%d\n",
> +                       desired_clk, ser_clk, *clk_div);
>
>         return ser_clk;
>  }
> @@ -1021,8 +1080,7 @@ static void qcom_geni_serial_set_termios(struct uart_port *uport,
>         if (ver >= QUP_SE_VERSION_2_5)
>                 sampling_rate /= 2;
>
> -       clk_rate = get_clk_div_rate(port->se.clk, baud,
> -               sampling_rate, &clk_div);
> +       clk_rate = get_clk_div_rate(port->se.clk, baud, sampling_rate, &clk_div);
>         if (!clk_rate)
>                 goto out_restart_rx;
>
> --
> Qualcomm INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, hosted by the Linux Foundation.
>
Greg KH June 30, 2022, 3 p.m. UTC | #2
On Wed, Jun 29, 2022 at 03:30:41PM +0530, Vijaya Krishna Nivarthi wrote:
> In the logic around call to clk_round_rate(), for some corner conditions,
> get_clk_div_rate() could return an sub-optimal clock rate. Also, if an
> exact clock rate was not found lowest clock was being returned.
> 
> Search for suitable clock rate in 2 steps
> a) exact match or within 2% tolerance
> b) within 5% tolerance
> This also takes care of corner conditions.
> 
> Reported-by: kernel test robot <lkp@intel.com>

Did the test robot really report the original issue, or just the v2
change?

thanks,

greg k-h
Vijaya Krishna Nivarthi June 30, 2022, 5:19 p.m. UTC | #3
> -----Original Message-----
> From: Greg KH <gregkh@linuxfoundation.org>
> Sent: Thursday, June 30, 2022 8:31 PM
> To: Vijaya Krishna Nivarthi (Temp) (QUIC) <quic_vnivarth@quicinc.com>
> Cc: agross@kernel.org; bjorn.andersson@linaro.org;
> konrad.dybcio@somainline.org; jirislaby@kernel.org; linux-arm-
> msm@vger.kernel.org; linux-serial@vger.kernel.org; linux-
> kernel@vger.kernel.org; Mukesh Savaliya (QUIC)
> <quic_msavaliy@quicinc.com>; dianders@chromium.org;
> mka@chromium.org; swboyd@chromium.org
> Subject: Re: [V2] tty: serial: qcom-geni-serial: Fix get_clk_div_rate() which
> otherwise could return a sub-optimal clock rate.
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> On Wed, Jun 29, 2022 at 03:30:41PM +0530, Vijaya Krishna Nivarthi wrote:
> > In the logic around call to clk_round_rate(), for some corner
> > conditions,
> > get_clk_div_rate() could return an sub-optimal clock rate. Also, if an
> > exact clock rate was not found lowest clock was being returned.
> >
> > Search for suitable clock rate in 2 steps
> > a) exact match or within 2% tolerance
> > b) within 5% tolerance
> > This also takes care of corner conditions.
> >
> > Reported-by: kernel test robot <lkp@intel.com>
> 
> Did the test robot really report the original issue, or just the v2 change?
> 
> thanks,
> 
> greg k-h

Test robot raised error for v1 patch and (I think) it got addressed in v2 with call to div_u64.
V2 doesn't have this error but other warnings which I am addressing along with other feedback.
Below is the error raised for v1.
Thank you,
Vijay/


All errors (new ones prefixed by >>):

   arm-linux-gnueabi-ld: arm-linux-gnueabi-ld: DWARF error: could not find abbrev number 121
   drivers/tty/serial/qcom_geni_serial.o: in function `find_clk_rate_in_tol':
>> qcom_geni_serial.c:(.text+0x764): undefined reference to `__aeabi_uldivmod'
Doug Anderson June 30, 2022, 9:57 p.m. UTC | #4
Hi,

On Thu, Jun 30, 2022 at 10:19 AM Vijaya Krishna Nivarthi (Temp) (QUIC)
<quic_vnivarth@quicinc.com> wrote:
>
> > -----Original Message-----
> > From: Greg KH <gregkh@linuxfoundation.org>
> > Sent: Thursday, June 30, 2022 8:31 PM
> > To: Vijaya Krishna Nivarthi (Temp) (QUIC) <quic_vnivarth@quicinc.com>
> > Cc: agross@kernel.org; bjorn.andersson@linaro.org;
> > konrad.dybcio@somainline.org; jirislaby@kernel.org; linux-arm-
> > msm@vger.kernel.org; linux-serial@vger.kernel.org; linux-
> > kernel@vger.kernel.org; Mukesh Savaliya (QUIC)
> > <quic_msavaliy@quicinc.com>; dianders@chromium.org;
> > mka@chromium.org; swboyd@chromium.org
> > Subject: Re: [V2] tty: serial: qcom-geni-serial: Fix get_clk_div_rate() which
> > otherwise could return a sub-optimal clock rate.
> >
> > WARNING: This email originated from outside of Qualcomm. Please be wary
> > of any links or attachments, and do not enable macros.
> >
> > On Wed, Jun 29, 2022 at 03:30:41PM +0530, Vijaya Krishna Nivarthi wrote:
> > > In the logic around call to clk_round_rate(), for some corner
> > > conditions,
> > > get_clk_div_rate() could return an sub-optimal clock rate. Also, if an
> > > exact clock rate was not found lowest clock was being returned.
> > >
> > > Search for suitable clock rate in 2 steps
> > > a) exact match or within 2% tolerance
> > > b) within 5% tolerance
> > > This also takes care of corner conditions.
> > >
> > > Reported-by: kernel test robot <lkp@intel.com>
> >
> > Did the test robot really report the original issue, or just the v2 change?
> >
> > thanks,
> >
> > greg k-h
>
> Test robot raised error for v1 patch and (I think) it got addressed in v2 with call to div_u64.
> V2 doesn't have this error but other warnings which I am addressing along with other feedback.
> Below is the error raised for v1.

I think the adding of the "Reported-by" only really makes sense if the
commit landed and then you fixed the robot-reported bug in a separate
commit. If it reported problems in v1 and you fix them in v2 you
shouldn't add the tag.

-Doug
Vijaya Krishna Nivarthi July 1, 2022, 11:04 a.m. UTC | #5
Hi,


> -----Original Message-----
> From: Doug Anderson <dianders@chromium.org>
> Sent: Thursday, June 30, 2022 4:45 AM
> To: Vijaya Krishna Nivarthi (Temp) (QUIC) <quic_vnivarth@quicinc.com>
> Cc: Andy Gross <agross@kernel.org>; bjorn.andersson@linaro.org; Konrad
> Dybcio <konrad.dybcio@somainline.org>; Greg Kroah-Hartman
> <gregkh@linuxfoundation.org>; Jiri Slaby <jirislaby@kernel.org>; linux-arm-
> msm <linux-arm-msm@vger.kernel.org>; linux-serial@vger.kernel.org; LKML
> <linux-kernel@vger.kernel.org>; Mukesh Savaliya (QUIC)
> <quic_msavaliy@quicinc.com>; Matthias Kaehlcke <mka@chromium.org>;
> Stephen Boyd <swboyd@chromium.org>
> Subject: Re: [V2] tty: serial: qcom-geni-serial: Fix get_clk_div_rate() which
> otherwise could return a sub-optimal clock rate.
> 
> 
> 
> > +                               /* Save the first (lowest freq) within tolerance */
> > +                               ser_clk = freq;
> > +                               *clk_div = new_div;
> > +                               /* no more search for exact match required in 2nd run */
> > +                               if (!exact_match)
> > +                                       break;
> > +                       }
> > +               }
> >
> > -               prev = freq;
> > +               div = freq / desired_clk + 1;
> 
> Can't you infinite loop now?
> 
> Start with:
> 
> desired_clk = 10000
> div = 1
> percent_tol = 2
> 
> 
> Now:
> 
> mult = 10000
> offset = 200
> test_freq = 9800
> freq = 9800
> div = 9800 / 10000 + 1 = 0 + 1 = 1
> 
> ...and then you'll loop again with "div = 1", won't you? ...or did I get
> something wrong in my analysis? This is the reason my proposed algorithm
> had two loops.
> 
> 

I went back to your proposed algorithm and made couple of simple changes, and it seemed like what we need.

a) look only for exact match once a clock rate within tolerance is found
b) swap test_freq and freq at end of while loops to make it run as desired


	maxdiv = CLK_DIV_MSK >> CLK_DIV_SHFT;
	div = 1;

	while (div < maxdiv) {
		mult = (unsigned long long)div * desired_clk;
		if (mult != (unsigned long)mult)
			break;

		if (ser_clk)
			offset = 0;
		===================a=====================
		else
			offset = div_u64(mult * percent_tol, 100);

		/*
		 * Loop requesting (freq - 2%) and possibly (freq).
		 *
		 * We'll keep track of the lowest freq inexact match we found
		 * but always try to find a perfect match. NOTE: this algorithm
		 * could miss a slightly better freq if there's more than one
		 * freq between (freq - 2%) and (freq) but (freq) can't be made
		 * exactly, but that's OK.
		 *
		 * This absolutely relies on the fact that the Qualcomm clock
		 * driver always rounds up.
		 */
		test_freq = mult - offset;
		while (test_freq <= mult) {
			freq = clk_round_rate(clk, test_freq);

			/*
			 * A dead-on freq is an insta-win. This implicitly
			 * handles when "freq == mult"
			 */
			if (!(freq % desired_clk)) {
				*clk_div = freq / desired_clk;
				return freq;
			}

			/*
			 * Only time clock framework doesn't round up is if
			 * we're past the max clock rate. We're done searching
			 * if that's the case.
			 */
			if (freq < test_freq)
				return ser_clk;

			/* Save the first (lowest freq) within tolerance */
			if (!ser_clk && freq <= mult + offset) {
				ser_clk = freq;
				*clk_div = div;
			}

			/*
			 * If we already rounded up past mult then this will
			 * cause the loop to exit. If not then this will run
			 * the loop a second time with exactly mult.
			 */
			test_freq = max(test_freq + 1, mult);
			                             ====b====
		}

		/*
		 * freq will always be bigger than mult by at least 1.
		 * That means we can get the next divider with a DIV_ROUND_UP.
		 * This has the advantage of skipping by a whole bunch of divs
		 * If the clock framework already bypassed them.
		 */
		div = DIV_ROUND_UP(freq, desired_clk);
		                                       ===b==
	}


Will also drop exact_match now.

Will upload v3 after testing.

Thank you,
Vijay/
Doug Anderson July 1, 2022, 3:08 p.m. UTC | #6
Hi,

On Fri, Jul 1, 2022 at 4:04 AM Vijaya Krishna Nivarthi (Temp) (QUIC)
<quic_vnivarth@quicinc.com> wrote:
>
> Hi,
>
>
> > -----Original Message-----
> > From: Doug Anderson <dianders@chromium.org>
> > Sent: Thursday, June 30, 2022 4:45 AM
> > To: Vijaya Krishna Nivarthi (Temp) (QUIC) <quic_vnivarth@quicinc.com>
> > Cc: Andy Gross <agross@kernel.org>; bjorn.andersson@linaro.org; Konrad
> > Dybcio <konrad.dybcio@somainline.org>; Greg Kroah-Hartman
> > <gregkh@linuxfoundation.org>; Jiri Slaby <jirislaby@kernel.org>; linux-arm-
> > msm <linux-arm-msm@vger.kernel.org>; linux-serial@vger.kernel.org; LKML
> > <linux-kernel@vger.kernel.org>; Mukesh Savaliya (QUIC)
> > <quic_msavaliy@quicinc.com>; Matthias Kaehlcke <mka@chromium.org>;
> > Stephen Boyd <swboyd@chromium.org>
> > Subject: Re: [V2] tty: serial: qcom-geni-serial: Fix get_clk_div_rate() which
> > otherwise could return a sub-optimal clock rate.
> >
> >
> >
> > > +                               /* Save the first (lowest freq) within tolerance */
> > > +                               ser_clk = freq;
> > > +                               *clk_div = new_div;
> > > +                               /* no more search for exact match required in 2nd run */
> > > +                               if (!exact_match)
> > > +                                       break;
> > > +                       }
> > > +               }
> > >
> > > -               prev = freq;
> > > +               div = freq / desired_clk + 1;
> >
> > Can't you infinite loop now?
> >
> > Start with:
> >
> > desired_clk = 10000
> > div = 1
> > percent_tol = 2
> >
> >
> > Now:
> >
> > mult = 10000
> > offset = 200
> > test_freq = 9800
> > freq = 9800
> > div = 9800 / 10000 + 1 = 0 + 1 = 1
> >
> > ...and then you'll loop again with "div = 1", won't you? ...or did I get
> > something wrong in my analysis? This is the reason my proposed algorithm
> > had two loops.
> >
> >
>
> I went back to your proposed algorithm and made couple of simple changes, and it seemed like what we need.
>
> a) look only for exact match once a clock rate within tolerance is found
> b) swap test_freq and freq at end of while loops to make it run as desired
>
>
>         maxdiv = CLK_DIV_MSK >> CLK_DIV_SHFT;
>         div = 1;
>
>         while (div < maxdiv) {
>                 mult = (unsigned long long)div * desired_clk;
>                 if (mult != (unsigned long)mult)
>                         break;
>
>                 if (ser_clk)
>                         offset = 0;
>                 ===================a=====================
>                 else
>                         offset = div_u64(mult * percent_tol, 100);
>
>                 /*
>                  * Loop requesting (freq - 2%) and possibly (freq).
>                  *
>                  * We'll keep track of the lowest freq inexact match we found
>                  * but always try to find a perfect match. NOTE: this algorithm
>                  * could miss a slightly better freq if there's more than one
>                  * freq between (freq - 2%) and (freq) but (freq) can't be made
>                  * exactly, but that's OK.
>                  *
>                  * This absolutely relies on the fact that the Qualcomm clock
>                  * driver always rounds up.
>                  */
>                 test_freq = mult - offset;
>                 while (test_freq <= mult) {
>                         freq = clk_round_rate(clk, test_freq);
>
>                         /*
>                          * A dead-on freq is an insta-win. This implicitly
>                          * handles when "freq == mult"
>                          */
>                         if (!(freq % desired_clk)) {
>                                 *clk_div = freq / desired_clk;
>                                 return freq;
>                         }
>
>                         /*
>                          * Only time clock framework doesn't round up is if
>                          * we're past the max clock rate. We're done searching
>                          * if that's the case.
>                          */
>                         if (freq < test_freq)
>                                 return ser_clk;
>
>                         /* Save the first (lowest freq) within tolerance */
>                         if (!ser_clk && freq <= mult + offset) {
>                                 ser_clk = freq;
>                                 *clk_div = div;
>                         }
>
>                         /*
>                          * If we already rounded up past mult then this will
>                          * cause the loop to exit. If not then this will run
>                          * the loop a second time with exactly mult.
>                          */
>                         test_freq = max(test_freq + 1, mult);
>                                                      ====b====
>                 }
>
>                 /*
>                  * freq will always be bigger than mult by at least 1.
>                  * That means we can get the next divider with a DIV_ROUND_UP.
>                  * This has the advantage of skipping by a whole bunch of divs
>                  * If the clock framework already bypassed them.
>                  */
>                 div = DIV_ROUND_UP(freq, desired_clk);
>                                                        ===b==
>         }
>
>
> Will also drop exact_match now.
>
> Will upload v3 after testing.

The more I've been thinking about it, the more I wonder if we even
need the special case of looking for an exact match at all. It feels
like we should choose one: we either look for the best match or we
look for the one with the lowest clock source rate. The weird
half-half approach that we have right now feels like over-engineering
and complicates things.

How about this (again, only lightly tested). Worst case if we _truly_
need a close-to-exact match we could pass a tolerance of 0 in and we'd
get something that's nearly exact, though I'm not suggesting we
actually do that. If we think 2% is good enough then we should just
accept the first (and lowest clock rate) 2% match we find.

    abs_tol = div_u64((u64)desired_clk * percent_tol, 100);
    maxdiv = CLK_DIV_MSK >> CLK_DIV_SHFT;
    div = 1;
    while (div <= maxdiv) {
        mult = (u64)div * desired_clk;
        if (mult != (unsigned long)mult)
            break;

        offset = div * abs_tol;
        freq = clk_round_rate(clk, mult - offset);

        /* Can only get lower if we're done */
        if (freq < mult - offset)
            break;

        /*
         * Re-calculate div in case rounding skipped rates but we
         * ended up at a good one, then check for a match.
         */
        div = DIV_ROUND_CLOSEST(freq, desired_clk);
        achieved = DIV_ROUND_CLOSEST(freq, div);
        if (achieved <= desired_clk + abs_tol &&
            achieved >= desired_clk - abs_tol) {
            *clk_div = div;
            return freq;
        }

        /*
         * Always increase div by at least one, but we'll go more than
         * one if clk_round_rate() gave us something higher.
         */
        div = DIV_ROUND_UP(max(freq, (unsigned long)mult) + 1, desired_clk);
        }

    return 0;
Vijaya Krishna Nivarthi (Temp) July 4, 2022, 6:57 p.m. UTC | #7
Hi,


> -----Original Message-----
> From: Doug Anderson <dianders@chromium.org>
> Sent: Friday, July 1, 2022 8:38 PM
> To: Vijaya Krishna Nivarthi (Temp) (QUIC) <quic_vnivarth@quicinc.com>
> Cc: Andy Gross <agross@kernel.org>; bjorn.andersson@linaro.org; Konrad
> Dybcio <konrad.dybcio@somainline.org>; Greg Kroah-Hartman
> <gregkh@linuxfoundation.org>; Jiri Slaby <jirislaby@kernel.org>; linux-arm-
> msm <linux-arm-msm@vger.kernel.org>; linux-serial@vger.kernel.org; LKML
> <linux-kernel@vger.kernel.org>; Mukesh Savaliya (QUIC)
> <quic_msavaliy@quicinc.com>; Matthias Kaehlcke <mka@chromium.org>;
> Stephen Boyd <swboyd@chromium.org>
> Subject: Re: [V2] tty: serial: qcom-geni-serial: Fix get_clk_div_rate() which
> otherwise could return a sub-optimal clock rate.
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> Hi,
> 
> On Fri, Jul 1, 2022 at 4:04 AM Vijaya Krishna Nivarthi (Temp) (QUIC)
> <quic_vnivarth@quicinc.com> wrote:
> >
> > Hi,
> >
> >
> > > -----Original Message-----
> > > From: Doug Anderson <dianders@chromium.org>
> > > Sent: Thursday, June 30, 2022 4:45 AM
> > > To: Vijaya Krishna Nivarthi (Temp) (QUIC)
> > > <quic_vnivarth@quicinc.com>
> > > Cc: Andy Gross <agross@kernel.org>; bjorn.andersson@linaro.org;
> > > Konrad Dybcio <konrad.dybcio@somainline.org>; Greg Kroah-Hartman
> > > <gregkh@linuxfoundation.org>; Jiri Slaby <jirislaby@kernel.org>;
> > > linux-arm- msm <linux-arm-msm@vger.kernel.org>;
> > > linux-serial@vger.kernel.org; LKML <linux-kernel@vger.kernel.org>;
> > > Mukesh Savaliya (QUIC) <quic_msavaliy@quicinc.com>; Matthias
> > > Kaehlcke <mka@chromium.org>; Stephen Boyd
> <swboyd@chromium.org>
> > > Subject: Re: [V2] tty: serial: qcom-geni-serial: Fix
> > > get_clk_div_rate() which otherwise could return a sub-optimal clock rate.
> > >
> > >
> > >
> > > > +                               /* Save the first (lowest freq) within tolerance */
> > > > +                               ser_clk = freq;
> > > > +                               *clk_div = new_div;
> > > > +                               /* no more search for exact match required in 2nd run
> */
> > > > +                               if (!exact_match)
> > > > +                                       break;
> > > > +                       }
> > > > +               }
> > > >
> > > > -               prev = freq;
> > > > +               div = freq / desired_clk + 1;
> > >
> > > Can't you infinite loop now?
> > >
> > > Start with:
> > >
> > > desired_clk = 10000
> > > div = 1
> > > percent_tol = 2
> > >
> > >
> > > Now:
> > >
> > > mult = 10000
> > > offset = 200
> > > test_freq = 9800
> > > freq = 9800
> > > div = 9800 / 10000 + 1 = 0 + 1 = 1
> > >
> > > ...and then you'll loop again with "div = 1", won't you? ...or did I
> > > get something wrong in my analysis? This is the reason my proposed
> > > algorithm had two loops.
> > >
> > >
> >
> > I went back to your proposed algorithm and made couple of simple
> changes, and it seemed like what we need.
> >
> > a) look only for exact match once a clock rate within tolerance is
> > found
> > b) swap test_freq and freq at end of while loops to make it run as
> > desired
> >
> >
> >         maxdiv = CLK_DIV_MSK >> CLK_DIV_SHFT;
> >         div = 1;
> >
> >         while (div < maxdiv) {
> >                 mult = (unsigned long long)div * desired_clk;
> >                 if (mult != (unsigned long)mult)
> >                         break;
> >
> >                 if (ser_clk)
> >                         offset = 0;
> >                 ===================a=====================
> >                 else
> >                         offset = div_u64(mult * percent_tol, 100);
> >
> >                 /*
> >                  * Loop requesting (freq - 2%) and possibly (freq).
> >                  *
> >                  * We'll keep track of the lowest freq inexact match we found
> >                  * but always try to find a perfect match. NOTE: this algorithm
> >                  * could miss a slightly better freq if there's more than one
> >                  * freq between (freq - 2%) and (freq) but (freq) can't be made
> >                  * exactly, but that's OK.
> >                  *
> >                  * This absolutely relies on the fact that the Qualcomm clock
> >                  * driver always rounds up.
> >                  */
> >                 test_freq = mult - offset;
> >                 while (test_freq <= mult) {
> >                         freq = clk_round_rate(clk, test_freq);
> >
> >                         /*
> >                          * A dead-on freq is an insta-win. This implicitly
> >                          * handles when "freq == mult"
> >                          */
> >                         if (!(freq % desired_clk)) {
> >                                 *clk_div = freq / desired_clk;
> >                                 return freq;
> >                         }
> >
> >                         /*
> >                          * Only time clock framework doesn't round up is if
> >                          * we're past the max clock rate. We're done searching
> >                          * if that's the case.
> >                          */
> >                         if (freq < test_freq)
> >                                 return ser_clk;
> >
> >                         /* Save the first (lowest freq) within tolerance */
> >                         if (!ser_clk && freq <= mult + offset) {
> >                                 ser_clk = freq;
> >                                 *clk_div = div;
> >                         }
> >
> >                         /*
> >                          * If we already rounded up past mult then this will
> >                          * cause the loop to exit. If not then this will run
> >                          * the loop a second time with exactly mult.
> >                          */
> >                         test_freq = max(test_freq + 1, mult);
> >                                                      ====b====
> >                 }
> >
> >                 /*
> >                  * freq will always be bigger than mult by at least 1.
> >                  * That means we can get the next divider with a DIV_ROUND_UP.
> >                  * This has the advantage of skipping by a whole bunch of divs
> >                  * If the clock framework already bypassed them.
> >                  */
> >                 div = DIV_ROUND_UP(freq, desired_clk);
> >                                                        ===b==
> >         }
> >
> >
> > Will also drop exact_match now.
> >
> > Will upload v3 after testing.
> 
> The more I've been thinking about it, the more I wonder if we even need the
> special case of looking for an exact match at all. It feels like we should choose
> one: we either look for the best match or we look for the one with the
> lowest clock source rate. The weird half-half approach that we have right
> now feels like over-engineering and complicates things.
> 
> How about this (again, only lightly tested). Worst case if we _truly_ need a
> close-to-exact match we could pass a tolerance of 0 in and we'd get
> something that's nearly exact, though I'm not suggesting we actually do that.
> If we think 2% is good enough then we should just accept the first (and
> lowest clock rate) 2% match we find.
> 
>     abs_tol = div_u64((u64)desired_clk * percent_tol, 100);
>     maxdiv = CLK_DIV_MSK >> CLK_DIV_SHFT;
>     div = 1;
>     while (div <= maxdiv) {
>         mult = (u64)div * desired_clk;
>         if (mult != (unsigned long)mult)
>             break;
> 
>         offset = div * abs_tol;
>         freq = clk_round_rate(clk, mult - offset);
> 
>         /* Can only get lower if we're done */
>         if (freq < mult - offset)
>             break;
> 
>         /*
>          * Re-calculate div in case rounding skipped rates but we
>          * ended up at a good one, then check for a match.
>          */
>         div = DIV_ROUND_CLOSEST(freq, desired_clk);
>         achieved = DIV_ROUND_CLOSEST(freq, div);
>         if (achieved <= desired_clk + abs_tol &&
>             achieved >= desired_clk - abs_tol) {
>             *clk_div = div;
>             return freq;
>         }
> 
>         /*
>          * Always increase div by at least one, but we'll go more than
>          * one if clk_round_rate() gave us something higher.
>          */
>         div = DIV_ROUND_UP(max(freq, (unsigned long)mult) + 1, desired_clk);

Wouldn’t DIV_ROUND_UP(freq, desired_clk) suffice here?
freq >= mult-offset, else we would have hit break.
Additionally if freq <= mult we would have hit return.
So always freq > mult?

And hence div++ would do the same?

Thank you.



>         }
> 
>     return 0;
kernel test robot July 4, 2022, 9:20 p.m. UTC | #8
Hi Vijaya,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on tty/tty-testing]
[also build test WARNING on linus/master v5.19-rc5 next-20220704]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Vijaya-Krishna-Nivarthi/tty-serial-qcom-geni-serial-Fix-get_clk_div_rate-which-otherwise-could-return-a-sub-optimal-clock-rate/20220629-180330
base:   https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty.git tty-testing
config: hexagon-buildonly-randconfig-r006-20220703 (https://download.01.org/0day-ci/archive/20220705/202207050527.wrtnyin5-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project f7a80c3d08d4821e621fc88d6a2e435291f82dff)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/a70b5a9759aef627b6882576f38399ed8c092b74
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Vijaya-Krishna-Nivarthi/tty-serial-qcom-geni-serial-Fix-get_clk_div_rate-which-otherwise-could-return-a-sub-optimal-clock-rate/20220629-180330
        git checkout a70b5a9759aef627b6882576f38399ed8c092b74
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash drivers/tty/serial/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/tty/serial/qcom_geni_serial.c:1044:56: warning: format specifies type 'int' but the argument has type 'unsigned long' [-Wformat]
                   pr_err("Couldn't find suitable clock rate for %d\n", desired_clk);
                                                                 ~~     ^~~~~~~~~~~
                                                                 %lu
   include/linux/printk.h:523:33: note: expanded from macro 'pr_err'
           printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
                                  ~~~     ^~~~~~~~~~~
   include/linux/printk.h:480:60: note: expanded from macro 'printk'
   #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
                                                       ~~~    ^~~~~~~~~~~
   include/linux/printk.h:452:19: note: expanded from macro 'printk_index_wrap'
                   _p_func(_fmt, ##__VA_ARGS__);                           \
                           ~~~~    ^~~~~~~~~~~
   drivers/tty/serial/qcom_geni_serial.c:1047:4: warning: format specifies type 'int' but the argument has type 'unsigned long' [-Wformat]
                           desired_clk, ser_clk, *clk_div);
                           ^~~~~~~~~~~
   include/linux/printk.h:610:38: note: expanded from macro 'pr_debug'
           no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
                                       ~~~     ^~~~~~~~~~~
   include/linux/printk.h:131:17: note: expanded from macro 'no_printk'
                   printk(fmt, ##__VA_ARGS__);             \
                          ~~~    ^~~~~~~~~~~
   include/linux/printk.h:480:60: note: expanded from macro 'printk'
   #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
                                                       ~~~    ^~~~~~~~~~~
   include/linux/printk.h:452:19: note: expanded from macro 'printk_index_wrap'
                   _p_func(_fmt, ##__VA_ARGS__);                           \
                           ~~~~    ^~~~~~~~~~~
   drivers/tty/serial/qcom_geni_serial.c:1047:17: warning: format specifies type 'int' but the argument has type 'unsigned long' [-Wformat]
                           desired_clk, ser_clk, *clk_div);
                                        ^~~~~~~
   include/linux/printk.h:610:38: note: expanded from macro 'pr_debug'
           no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
                                       ~~~     ^~~~~~~~~~~
   include/linux/printk.h:131:17: note: expanded from macro 'no_printk'
                   printk(fmt, ##__VA_ARGS__);             \
                          ~~~    ^~~~~~~~~~~
   include/linux/printk.h:480:60: note: expanded from macro 'printk'
   #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
                                                       ~~~    ^~~~~~~~~~~
   include/linux/printk.h:452:19: note: expanded from macro 'printk_index_wrap'
                   _p_func(_fmt, ##__VA_ARGS__);                           \
                           ~~~~    ^~~~~~~~~~~
   3 warnings generated.


vim +1044 drivers/tty/serial/qcom_geni_serial.c

  1021	
  1022	static unsigned long get_clk_div_rate(struct clk *clk, unsigned int baud,
  1023				unsigned int sampling_rate, unsigned int *clk_div)
  1024	{
  1025		unsigned long ser_clk;
  1026		unsigned long desired_clk;
  1027	
  1028		desired_clk = baud * sampling_rate;
  1029		if (!desired_clk) {
  1030			pr_err("%s: Invalid frequency\n", __func__);
  1031			return 0;
  1032		}
  1033	
  1034		ser_clk = 0;
  1035		/*
  1036		 * try to find exact clock rate or within 2% tolerance,
  1037		 * then within 5% tolerance
  1038		 */
  1039		ser_clk = find_clk_rate_in_tol(clk, desired_clk, clk_div, 2, true);
  1040		if (!ser_clk)
  1041			ser_clk = find_clk_rate_in_tol(clk, desired_clk, clk_div, 5, false);
  1042	
  1043		if (!ser_clk)
> 1044			pr_err("Couldn't find suitable clock rate for %d\n", desired_clk);
  1045		else
  1046			pr_debug("desired_clk-%d, ser_clk-%d, clk_div-%d\n",
  1047				desired_clk, ser_clk, *clk_div);
  1048	
  1049		return ser_clk;
  1050	}
  1051
Doug Anderson July 6, 2022, 3:26 p.m. UTC | #9
Hi,

On Mon, Jul 4, 2022 at 11:57 AM Vijaya Krishna Nivarthi (Temp)
<vnivarth@qti.qualcomm.com> wrote:
>
> Hi,
>
>
> > -----Original Message-----
> > From: Doug Anderson <dianders@chromium.org>
> > Sent: Friday, July 1, 2022 8:38 PM
> > To: Vijaya Krishna Nivarthi (Temp) (QUIC) <quic_vnivarth@quicinc.com>
> > Cc: Andy Gross <agross@kernel.org>; bjorn.andersson@linaro.org; Konrad
> > Dybcio <konrad.dybcio@somainline.org>; Greg Kroah-Hartman
> > <gregkh@linuxfoundation.org>; Jiri Slaby <jirislaby@kernel.org>; linux-arm-
> > msm <linux-arm-msm@vger.kernel.org>; linux-serial@vger.kernel.org; LKML
> > <linux-kernel@vger.kernel.org>; Mukesh Savaliya (QUIC)
> > <quic_msavaliy@quicinc.com>; Matthias Kaehlcke <mka@chromium.org>;
> > Stephen Boyd <swboyd@chromium.org>
> > Subject: Re: [V2] tty: serial: qcom-geni-serial: Fix get_clk_div_rate() which
> > otherwise could return a sub-optimal clock rate.
> >
> > WARNING: This email originated from outside of Qualcomm. Please be wary
> > of any links or attachments, and do not enable macros.
> >
> > Hi,
> >
> > On Fri, Jul 1, 2022 at 4:04 AM Vijaya Krishna Nivarthi (Temp) (QUIC)
> > <quic_vnivarth@quicinc.com> wrote:
> > >
> > > Hi,
> > >
> > >
> > > > -----Original Message-----
> > > > From: Doug Anderson <dianders@chromium.org>
> > > > Sent: Thursday, June 30, 2022 4:45 AM
> > > > To: Vijaya Krishna Nivarthi (Temp) (QUIC)
> > > > <quic_vnivarth@quicinc.com>
> > > > Cc: Andy Gross <agross@kernel.org>; bjorn.andersson@linaro.org;
> > > > Konrad Dybcio <konrad.dybcio@somainline.org>; Greg Kroah-Hartman
> > > > <gregkh@linuxfoundation.org>; Jiri Slaby <jirislaby@kernel.org>;
> > > > linux-arm- msm <linux-arm-msm@vger.kernel.org>;
> > > > linux-serial@vger.kernel.org; LKML <linux-kernel@vger.kernel.org>;
> > > > Mukesh Savaliya (QUIC) <quic_msavaliy@quicinc.com>; Matthias
> > > > Kaehlcke <mka@chromium.org>; Stephen Boyd
> > <swboyd@chromium.org>
> > > > Subject: Re: [V2] tty: serial: qcom-geni-serial: Fix
> > > > get_clk_div_rate() which otherwise could return a sub-optimal clock rate.
> > > >
> > > >
> > > >
> > > > > +                               /* Save the first (lowest freq) within tolerance */
> > > > > +                               ser_clk = freq;
> > > > > +                               *clk_div = new_div;
> > > > > +                               /* no more search for exact match required in 2nd run
> > */
> > > > > +                               if (!exact_match)
> > > > > +                                       break;
> > > > > +                       }
> > > > > +               }
> > > > >
> > > > > -               prev = freq;
> > > > > +               div = freq / desired_clk + 1;
> > > >
> > > > Can't you infinite loop now?
> > > >
> > > > Start with:
> > > >
> > > > desired_clk = 10000
> > > > div = 1
> > > > percent_tol = 2
> > > >
> > > >
> > > > Now:
> > > >
> > > > mult = 10000
> > > > offset = 200
> > > > test_freq = 9800
> > > > freq = 9800
> > > > div = 9800 / 10000 + 1 = 0 + 1 = 1
> > > >
> > > > ...and then you'll loop again with "div = 1", won't you? ...or did I
> > > > get something wrong in my analysis? This is the reason my proposed
> > > > algorithm had two loops.
> > > >
> > > >
> > >
> > > I went back to your proposed algorithm and made couple of simple
> > changes, and it seemed like what we need.
> > >
> > > a) look only for exact match once a clock rate within tolerance is
> > > found
> > > b) swap test_freq and freq at end of while loops to make it run as
> > > desired
> > >
> > >
> > >         maxdiv = CLK_DIV_MSK >> CLK_DIV_SHFT;
> > >         div = 1;
> > >
> > >         while (div < maxdiv) {
> > >                 mult = (unsigned long long)div * desired_clk;
> > >                 if (mult != (unsigned long)mult)
> > >                         break;
> > >
> > >                 if (ser_clk)
> > >                         offset = 0;
> > >                 ===================a=====================
> > >                 else
> > >                         offset = div_u64(mult * percent_tol, 100);
> > >
> > >                 /*
> > >                  * Loop requesting (freq - 2%) and possibly (freq).
> > >                  *
> > >                  * We'll keep track of the lowest freq inexact match we found
> > >                  * but always try to find a perfect match. NOTE: this algorithm
> > >                  * could miss a slightly better freq if there's more than one
> > >                  * freq between (freq - 2%) and (freq) but (freq) can't be made
> > >                  * exactly, but that's OK.
> > >                  *
> > >                  * This absolutely relies on the fact that the Qualcomm clock
> > >                  * driver always rounds up.
> > >                  */
> > >                 test_freq = mult - offset;
> > >                 while (test_freq <= mult) {
> > >                         freq = clk_round_rate(clk, test_freq);
> > >
> > >                         /*
> > >                          * A dead-on freq is an insta-win. This implicitly
> > >                          * handles when "freq == mult"
> > >                          */
> > >                         if (!(freq % desired_clk)) {
> > >                                 *clk_div = freq / desired_clk;
> > >                                 return freq;
> > >                         }
> > >
> > >                         /*
> > >                          * Only time clock framework doesn't round up is if
> > >                          * we're past the max clock rate. We're done searching
> > >                          * if that's the case.
> > >                          */
> > >                         if (freq < test_freq)
> > >                                 return ser_clk;
> > >
> > >                         /* Save the first (lowest freq) within tolerance */
> > >                         if (!ser_clk && freq <= mult + offset) {
> > >                                 ser_clk = freq;
> > >                                 *clk_div = div;
> > >                         }
> > >
> > >                         /*
> > >                          * If we already rounded up past mult then this will
> > >                          * cause the loop to exit. If not then this will run
> > >                          * the loop a second time with exactly mult.
> > >                          */
> > >                         test_freq = max(test_freq + 1, mult);
> > >                                                      ====b====
> > >                 }
> > >
> > >                 /*
> > >                  * freq will always be bigger than mult by at least 1.
> > >                  * That means we can get the next divider with a DIV_ROUND_UP.
> > >                  * This has the advantage of skipping by a whole bunch of divs
> > >                  * If the clock framework already bypassed them.
> > >                  */
> > >                 div = DIV_ROUND_UP(freq, desired_clk);
> > >                                                        ===b==
> > >         }
> > >
> > >
> > > Will also drop exact_match now.
> > >
> > > Will upload v3 after testing.
> >
> > The more I've been thinking about it, the more I wonder if we even need the
> > special case of looking for an exact match at all. It feels like we should choose
> > one: we either look for the best match or we look for the one with the
> > lowest clock source rate. The weird half-half approach that we have right
> > now feels like over-engineering and complicates things.
> >
> > How about this (again, only lightly tested). Worst case if we _truly_ need a
> > close-to-exact match we could pass a tolerance of 0 in and we'd get
> > something that's nearly exact, though I'm not suggesting we actually do that.
> > If we think 2% is good enough then we should just accept the first (and
> > lowest clock rate) 2% match we find.
> >
> >     abs_tol = div_u64((u64)desired_clk * percent_tol, 100);
> >     maxdiv = CLK_DIV_MSK >> CLK_DIV_SHFT;
> >     div = 1;
> >     while (div <= maxdiv) {
> >         mult = (u64)div * desired_clk;
> >         if (mult != (unsigned long)mult)
> >             break;
> >
> >         offset = div * abs_tol;
> >         freq = clk_round_rate(clk, mult - offset);
> >
> >         /* Can only get lower if we're done */
> >         if (freq < mult - offset)
> >             break;
> >
> >         /*
> >          * Re-calculate div in case rounding skipped rates but we
> >          * ended up at a good one, then check for a match.
> >          */
> >         div = DIV_ROUND_CLOSEST(freq, desired_clk);
> >         achieved = DIV_ROUND_CLOSEST(freq, div);
> >         if (achieved <= desired_clk + abs_tol &&
> >             achieved >= desired_clk - abs_tol) {
> >             *clk_div = div;
> >             return freq;
> >         }
> >
> >         /*
> >          * Always increase div by at least one, but we'll go more than
> >          * one if clk_round_rate() gave us something higher.
> >          */
> >         div = DIV_ROUND_UP(max(freq, (unsigned long)mult) + 1, desired_clk);
>
> Wouldn’t DIV_ROUND_UP(freq, desired_clk) suffice here?
> freq >= mult-offset, else we would have hit break.

No. As you say, freq >= "mult-offset". That means that freq could be
== "mult-offset", right? If offset > 0 then freq could be < mult. Then
your DIV_ROUND_UP() would just take you right back to where you
started the loop with and you'd end up with an infinite loop, wouldn't
you?


> Additionally if freq <= mult we would have hit return.
> So always freq > mult?
>
> And hence div++ would do the same?

I thought about it and I decided that it might not if the clock
framework skipped a whole bunch. Let's see if I can give an example.

Let's say
* "desired_clk" is 10000
* "percent_tol" is 2 (abs_tol = 200)
* We can make clocks 17000, 20000, 25000.

First time through the loop:

mult = 10000
offset = 200
freq = 17000
div = 2
achieved = 8500 (not within tolerance)

...at the end of the loop if we do "div++" then we'll end up with
div=3 for the next loop and we'll miss finding 20000.
...but if we do my math, we end up with:

DIV_ROUND_UP(max(17000, 10000) + 1, 10000)
DIV_ROUND_UP(17000 + 1, 10000)
DIV_ROUND_UP(17000, 10000)
2

...and that's exactly what we want.


Here's an example showing why the line "div = DIV_ROUND_CLOSEST(freq,
desired_clk)" is important:

* "desired_clk" is 10000
* "percent_tol" is 2 (abs_tol = 200)
* We can make clocks 19600, 25000.

mult = 10000
offset = 200
freq = 19600
div = 2
achieved = 9800

Returns 19600 and div=2


Here's an example showing how the clock framework rounding lets us
skip some "div"s without missing anything important:

* "desired_clk" is 10000
* "percent_tol" is 2 (abs_tol = 200)
* We can make clocks 24000, 30000.

mult = 25000
offset = 200
freq = 24000
div = 2
achieved = 12000 (not within tolerance)

div = DIV_ROUND_UP(max(24000, 10000) + 1, 10000)
div = 3

mult = 30000
offset = 600
freq = 30000
div = 3

-Doug
Vijaya Krishna Nivarthi July 6, 2022, 5:43 p.m. UTC | #10
Hi,


On 7/6/2022 8:56 PM, Doug Anderson wrote:
> Hi,
>
> On Mon, Jul 4, 2022 at 11:57 AM Vijaya Krishna Nivarthi (Temp)
> <vnivarth@qti.qualcomm.com> wrote:
>> Hi,
>>
>>
>>> -----Original Message-----
>>> From: Doug Anderson <dianders@chromium.org>
>>> Sent: Friday, July 1, 2022 8:38 PM
>>> To: Vijaya Krishna Nivarthi (Temp) (QUIC) <quic_vnivarth@quicinc.com>
>>> Cc: Andy Gross <agross@kernel.org>; bjorn.andersson@linaro.org; Konrad
>>> Dybcio <konrad.dybcio@somainline.org>; Greg Kroah-Hartman
>>> <gregkh@linuxfoundation.org>; Jiri Slaby <jirislaby@kernel.org>; linux-arm-
>>> msm <linux-arm-msm@vger.kernel.org>; linux-serial@vger.kernel.org; LKML
>>> <linux-kernel@vger.kernel.org>; Mukesh Savaliya (QUIC)
>>> <quic_msavaliy@quicinc.com>; Matthias Kaehlcke <mka@chromium.org>;
>>> Stephen Boyd <swboyd@chromium.org>
>>> Subject: Re: [V2] tty: serial: qcom-geni-serial: Fix get_clk_div_rate() which
>>> otherwise could return a sub-optimal clock rate.
>>>
>>> WARNING: This email originated from outside of Qualcomm. Please be wary
>>> of any links or attachments, and do not enable macros.
>>>
>>> Hi,
>>>
>>> On Fri, Jul 1, 2022 at 4:04 AM Vijaya Krishna Nivarthi (Temp) (QUIC)
>>> <quic_vnivarth@quicinc.com> wrote:
>>>> Hi,
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Doug Anderson <dianders@chromium.org>
>>>>> Sent: Thursday, June 30, 2022 4:45 AM
>>>>> To: Vijaya Krishna Nivarthi (Temp) (QUIC)
>>>>> <quic_vnivarth@quicinc.com>
>>>>> Cc: Andy Gross <agross@kernel.org>; bjorn.andersson@linaro.org;
>>>>> Konrad Dybcio <konrad.dybcio@somainline.org>; Greg Kroah-Hartman
>>>>> <gregkh@linuxfoundation.org>; Jiri Slaby <jirislaby@kernel.org>;
>>>>> linux-arm- msm <linux-arm-msm@vger.kernel.org>;
>>>>> linux-serial@vger.kernel.org; LKML <linux-kernel@vger.kernel.org>;
>>>>> Mukesh Savaliya (QUIC) <quic_msavaliy@quicinc.com>; Matthias
>>>>> Kaehlcke <mka@chromium.org>; Stephen Boyd
>>> <swboyd@chromium.org>
>>>>> Subject: Re: [V2] tty: serial: qcom-geni-serial: Fix
>>>>> get_clk_div_rate() which otherwise could return a sub-optimal clock rate.
>>>>>
>>>>>
>>>>>
>>>>>> +                               /* Save the first (lowest freq) within tolerance */
>>>>>> +                               ser_clk = freq;
>>>>>> +                               *clk_div = new_div;
>>>>>> +                               /* no more search for exact match required in 2nd run
>>> */
>>>>>> +                               if (!exact_match)
>>>>>> +                                       break;
>>>>>> +                       }
>>>>>> +               }
>>>>>>
>>>>>> -               prev = freq;
>>>>>> +               div = freq / desired_clk + 1;
>>>>> Can't you infinite loop now?
>>>>>
>>>>> Start with:
>>>>>
>>>>> desired_clk = 10000
>>>>> div = 1
>>>>> percent_tol = 2
>>>>>
>>>>>
>>>>> Now:
>>>>>
>>>>> mult = 10000
>>>>> offset = 200
>>>>> test_freq = 9800
>>>>> freq = 9800
>>>>> div = 9800 / 10000 + 1 = 0 + 1 = 1
>>>>>
>>>>> ...and then you'll loop again with "div = 1", won't you? ...or did I
>>>>> get something wrong in my analysis? This is the reason my proposed
>>>>> algorithm had two loops.
>>>>>
>>>>>
>>>> I went back to your proposed algorithm and made couple of simple
>>> changes, and it seemed like what we need.
>>>> a) look only for exact match once a clock rate within tolerance is
>>>> found
>>>> b) swap test_freq and freq at end of while loops to make it run as
>>>> desired
>>>>
>>>>
>>>>          maxdiv = CLK_DIV_MSK >> CLK_DIV_SHFT;
>>>>          div = 1;
>>>>
>>>>          while (div < maxdiv) {
>>>>                  mult = (unsigned long long)div * desired_clk;
>>>>                  if (mult != (unsigned long)mult)
>>>>                          break;
>>>>
>>>>                  if (ser_clk)
>>>>                          offset = 0;
>>>>                  ===================a=====================
>>>>                  else
>>>>                          offset = div_u64(mult * percent_tol, 100);
>>>>
>>>>                  /*
>>>>                   * Loop requesting (freq - 2%) and possibly (freq).
>>>>                   *
>>>>                   * We'll keep track of the lowest freq inexact match we found
>>>>                   * but always try to find a perfect match. NOTE: this algorithm
>>>>                   * could miss a slightly better freq if there's more than one
>>>>                   * freq between (freq - 2%) and (freq) but (freq) can't be made
>>>>                   * exactly, but that's OK.
>>>>                   *
>>>>                   * This absolutely relies on the fact that the Qualcomm clock
>>>>                   * driver always rounds up.
>>>>                   */
>>>>                  test_freq = mult - offset;
>>>>                  while (test_freq <= mult) {
>>>>                          freq = clk_round_rate(clk, test_freq);
>>>>
>>>>                          /*
>>>>                           * A dead-on freq is an insta-win. This implicitly
>>>>                           * handles when "freq == mult"
>>>>                           */
>>>>                          if (!(freq % desired_clk)) {
>>>>                                  *clk_div = freq / desired_clk;
>>>>                                  return freq;
>>>>                          }
>>>>
>>>>                          /*
>>>>                           * Only time clock framework doesn't round up is if
>>>>                           * we're past the max clock rate. We're done searching
>>>>                           * if that's the case.
>>>>                           */
>>>>                          if (freq < test_freq)
>>>>                                  return ser_clk;
>>>>
>>>>                          /* Save the first (lowest freq) within tolerance */
>>>>                          if (!ser_clk && freq <= mult + offset) {
>>>>                                  ser_clk = freq;
>>>>                                  *clk_div = div;
>>>>                          }
>>>>
>>>>                          /*
>>>>                           * If we already rounded up past mult then this will
>>>>                           * cause the loop to exit. If not then this will run
>>>>                           * the loop a second time with exactly mult.
>>>>                           */
>>>>                          test_freq = max(test_freq + 1, mult);
>>>>                                                       ====b====
>>>>                  }
>>>>
>>>>                  /*
>>>>                   * freq will always be bigger than mult by at least 1.
>>>>                   * That means we can get the next divider with a DIV_ROUND_UP.
>>>>                   * This has the advantage of skipping by a whole bunch of divs
>>>>                   * If the clock framework already bypassed them.
>>>>                   */
>>>>                  div = DIV_ROUND_UP(freq, desired_clk);
>>>>                                                         ===b==
>>>>          }
>>>>
>>>>
>>>> Will also drop exact_match now.
>>>>
>>>> Will upload v3 after testing.
>>> The more I've been thinking about it, the more I wonder if we even need the
>>> special case of looking for an exact match at all. It feels like we should choose
>>> one: we either look for the best match or we look for the one with the
>>> lowest clock source rate. The weird half-half approach that we have right
>>> now feels like over-engineering and complicates things.
>>>
>>> How about this (again, only lightly tested). Worst case if we _truly_ need a
>>> close-to-exact match we could pass a tolerance of 0 in and we'd get
>>> something that's nearly exact, though I'm not suggesting we actually do that.
>>> If we think 2% is good enough then we should just accept the first (and
>>> lowest clock rate) 2% match we find.
>>>
>>>      abs_tol = div_u64((u64)desired_clk * percent_tol, 100);
>>>      maxdiv = CLK_DIV_MSK >> CLK_DIV_SHFT;
>>>      div = 1;
>>>      while (div <= maxdiv) {
>>>          mult = (u64)div * desired_clk;
>>>          if (mult != (unsigned long)mult)
>>>              break;
>>>
>>>          offset = div * abs_tol;
>>>          freq = clk_round_rate(clk, mult - offset);
>>>
>>>          /* Can only get lower if we're done */
>>>          if (freq < mult - offset)
>>>              break;
>>>
>>>          /*
>>>           * Re-calculate div in case rounding skipped rates but we
>>>           * ended up at a good one, then check for a match.
>>>           */
>>>          div = DIV_ROUND_CLOSEST(freq, desired_clk);
>>>          achieved = DIV_ROUND_CLOSEST(freq, div);
>>>          if (achieved <= desired_clk + abs_tol &&
>>>              achieved >= desired_clk - abs_tol) {
>>>              *clk_div = div;
>>>              return freq;
>>>          }
>>>
>>>          /*
>>>           * Always increase div by at least one, but we'll go more than
>>>           * one if clk_round_rate() gave us something higher.
>>>           */
>>>          div = DIV_ROUND_UP(max(freq, (unsigned long)mult) + 1, desired_clk);
>> Wouldn’t DIV_ROUND_UP(freq, desired_clk) suffice here?
>> freq >= mult-offset, else we would have hit break.
> No. As you say, freq >= "mult-offset". That means that freq could be
> == "mult-offset", right? If offset > 0 then freq could be < mult. Then
> your DIV_ROUND_UP() would just take you right back to where you
> started the loop with and you'd end up with an infinite loop, wouldn't
> you?
>
Probably No.

( (freq >= mult-offset) && (freq <= mult) ) =>

( (freq >= mult-offset) && (freq <= mult+offset) )

would mean that

div = DIV_ROUND_CLOSEST(freq, desired_clk);
evaluates to original div and we are within tolerance and hence we can return and hence don't even reach DIV_ROUND_UP?

Please note that freq can skip any multiples and land up anywhere.

As long as it has not gone beyond clock rate table, either it lands 
within tolerance of nearest multiple of desired_clk (in which case we 
return)

OR

We move on to next div = (freq/desired_clk + 1)


I retract div++, I was mistaken to believe that DIV_ROUND_UP(freq, 
desired_clk) would be same as div++.

Thank you.

>> Additionally if freq <= mult we would have hit return.
>> So always freq > mult?
>>
>> And hence div++ would do the same?
> I thought about it and I decided that it might not if the clock
> framework skipped a whole bunch. Let's see if I can give an example.
>
> Let's say
> * "desired_clk" is 10000
> * "percent_tol" is 2 (abs_tol = 200)
> * We can make clocks 17000, 20000, 25000.
>
> First time through the loop:
>
> mult = 10000
> offset = 200
> freq = 17000
> div = 2
> achieved = 8500 (not within tolerance)
>
> ...at the end of the loop if we do "div++" then we'll end up with
> div=3 for the next loop and we'll miss finding 20000.
> ...but if we do my math, we end up with:
>
> DIV_ROUND_UP(max(17000, 10000) + 1, 10000)
> DIV_ROUND_UP(17000 + 1, 10000)
> DIV_ROUND_UP(17000, 10000)
> 2
>
> ...and that's exactly what we want.
>
>
> Here's an example showing why the line "div = DIV_ROUND_CLOSEST(freq,
> desired_clk)" is important:
>
> * "desired_clk" is 10000
> * "percent_tol" is 2 (abs_tol = 200)
> * We can make clocks 19600, 25000.
>
> mult = 10000
> offset = 200
> freq = 19600
> div = 2
> achieved = 9800
>
> Returns 19600 and div=2
>
>
> Here's an example showing how the clock framework rounding lets us
> skip some "div"s without missing anything important:
>
> * "desired_clk" is 10000
> * "percent_tol" is 2 (abs_tol = 200)
> * We can make clocks 24000, 30000.
>
> mult = 25000
> offset = 200
> freq = 24000
> div = 2
> achieved = 12000 (not within tolerance)
>
> div = DIV_ROUND_UP(max(24000, 10000) + 1, 10000)
> div = 3
>
> mult = 30000
> offset = 600
> freq = 30000
> div = 3
>
> -Doug
Doug Anderson July 6, 2022, 6:21 p.m. UTC | #11
Hi,

On Wed, Jul 6, 2022 at 10:44 AM Vijaya Krishna Nivarthi
<quic_vnivarth@quicinc.com> wrote:
>
> Hi,
>
>
> On 7/6/2022 8:56 PM, Doug Anderson wrote:
> > Hi,
> >
> > On Mon, Jul 4, 2022 at 11:57 AM Vijaya Krishna Nivarthi (Temp)
> > <vnivarth@qti.qualcomm.com> wrote:
> >> Hi,
> >>
> >>
> >>> -----Original Message-----
> >>> From: Doug Anderson <dianders@chromium.org>
> >>> Sent: Friday, July 1, 2022 8:38 PM
> >>> To: Vijaya Krishna Nivarthi (Temp) (QUIC) <quic_vnivarth@quicinc.com>
> >>> Cc: Andy Gross <agross@kernel.org>; bjorn.andersson@linaro.org; Konrad
> >>> Dybcio <konrad.dybcio@somainline.org>; Greg Kroah-Hartman
> >>> <gregkh@linuxfoundation.org>; Jiri Slaby <jirislaby@kernel.org>; linux-arm-
> >>> msm <linux-arm-msm@vger.kernel.org>; linux-serial@vger.kernel.org; LKML
> >>> <linux-kernel@vger.kernel.org>; Mukesh Savaliya (QUIC)
> >>> <quic_msavaliy@quicinc.com>; Matthias Kaehlcke <mka@chromium.org>;
> >>> Stephen Boyd <swboyd@chromium.org>
> >>> Subject: Re: [V2] tty: serial: qcom-geni-serial: Fix get_clk_div_rate() which
> >>> otherwise could return a sub-optimal clock rate.
> >>>
> >>> WARNING: This email originated from outside of Qualcomm. Please be wary
> >>> of any links or attachments, and do not enable macros.
> >>>
> >>> Hi,
> >>>
> >>> On Fri, Jul 1, 2022 at 4:04 AM Vijaya Krishna Nivarthi (Temp) (QUIC)
> >>> <quic_vnivarth@quicinc.com> wrote:
> >>>> Hi,
> >>>>
> >>>>
> >>>>> -----Original Message-----
> >>>>> From: Doug Anderson <dianders@chromium.org>
> >>>>> Sent: Thursday, June 30, 2022 4:45 AM
> >>>>> To: Vijaya Krishna Nivarthi (Temp) (QUIC)
> >>>>> <quic_vnivarth@quicinc.com>
> >>>>> Cc: Andy Gross <agross@kernel.org>; bjorn.andersson@linaro.org;
> >>>>> Konrad Dybcio <konrad.dybcio@somainline.org>; Greg Kroah-Hartman
> >>>>> <gregkh@linuxfoundation.org>; Jiri Slaby <jirislaby@kernel.org>;
> >>>>> linux-arm- msm <linux-arm-msm@vger.kernel.org>;
> >>>>> linux-serial@vger.kernel.org; LKML <linux-kernel@vger.kernel.org>;
> >>>>> Mukesh Savaliya (QUIC) <quic_msavaliy@quicinc.com>; Matthias
> >>>>> Kaehlcke <mka@chromium.org>; Stephen Boyd
> >>> <swboyd@chromium.org>
> >>>>> Subject: Re: [V2] tty: serial: qcom-geni-serial: Fix
> >>>>> get_clk_div_rate() which otherwise could return a sub-optimal clock rate.
> >>>>>
> >>>>>
> >>>>>
> >>>>>> +                               /* Save the first (lowest freq) within tolerance */
> >>>>>> +                               ser_clk = freq;
> >>>>>> +                               *clk_div = new_div;
> >>>>>> +                               /* no more search for exact match required in 2nd run
> >>> */
> >>>>>> +                               if (!exact_match)
> >>>>>> +                                       break;
> >>>>>> +                       }
> >>>>>> +               }
> >>>>>>
> >>>>>> -               prev = freq;
> >>>>>> +               div = freq / desired_clk + 1;
> >>>>> Can't you infinite loop now?
> >>>>>
> >>>>> Start with:
> >>>>>
> >>>>> desired_clk = 10000
> >>>>> div = 1
> >>>>> percent_tol = 2
> >>>>>
> >>>>>
> >>>>> Now:
> >>>>>
> >>>>> mult = 10000
> >>>>> offset = 200
> >>>>> test_freq = 9800
> >>>>> freq = 9800
> >>>>> div = 9800 / 10000 + 1 = 0 + 1 = 1
> >>>>>
> >>>>> ...and then you'll loop again with "div = 1", won't you? ...or did I
> >>>>> get something wrong in my analysis? This is the reason my proposed
> >>>>> algorithm had two loops.
> >>>>>
> >>>>>
> >>>> I went back to your proposed algorithm and made couple of simple
> >>> changes, and it seemed like what we need.
> >>>> a) look only for exact match once a clock rate within tolerance is
> >>>> found
> >>>> b) swap test_freq and freq at end of while loops to make it run as
> >>>> desired
> >>>>
> >>>>
> >>>>          maxdiv = CLK_DIV_MSK >> CLK_DIV_SHFT;
> >>>>          div = 1;
> >>>>
> >>>>          while (div < maxdiv) {
> >>>>                  mult = (unsigned long long)div * desired_clk;
> >>>>                  if (mult != (unsigned long)mult)
> >>>>                          break;
> >>>>
> >>>>                  if (ser_clk)
> >>>>                          offset = 0;
> >>>>                  ===================a=====================
> >>>>                  else
> >>>>                          offset = div_u64(mult * percent_tol, 100);
> >>>>
> >>>>                  /*
> >>>>                   * Loop requesting (freq - 2%) and possibly (freq).
> >>>>                   *
> >>>>                   * We'll keep track of the lowest freq inexact match we found
> >>>>                   * but always try to find a perfect match. NOTE: this algorithm
> >>>>                   * could miss a slightly better freq if there's more than one
> >>>>                   * freq between (freq - 2%) and (freq) but (freq) can't be made
> >>>>                   * exactly, but that's OK.
> >>>>                   *
> >>>>                   * This absolutely relies on the fact that the Qualcomm clock
> >>>>                   * driver always rounds up.
> >>>>                   */
> >>>>                  test_freq = mult - offset;
> >>>>                  while (test_freq <= mult) {
> >>>>                          freq = clk_round_rate(clk, test_freq);
> >>>>
> >>>>                          /*
> >>>>                           * A dead-on freq is an insta-win. This implicitly
> >>>>                           * handles when "freq == mult"
> >>>>                           */
> >>>>                          if (!(freq % desired_clk)) {
> >>>>                                  *clk_div = freq / desired_clk;
> >>>>                                  return freq;
> >>>>                          }
> >>>>
> >>>>                          /*
> >>>>                           * Only time clock framework doesn't round up is if
> >>>>                           * we're past the max clock rate. We're done searching
> >>>>                           * if that's the case.
> >>>>                           */
> >>>>                          if (freq < test_freq)
> >>>>                                  return ser_clk;
> >>>>
> >>>>                          /* Save the first (lowest freq) within tolerance */
> >>>>                          if (!ser_clk && freq <= mult + offset) {
> >>>>                                  ser_clk = freq;
> >>>>                                  *clk_div = div;
> >>>>                          }
> >>>>
> >>>>                          /*
> >>>>                           * If we already rounded up past mult then this will
> >>>>                           * cause the loop to exit. If not then this will run
> >>>>                           * the loop a second time with exactly mult.
> >>>>                           */
> >>>>                          test_freq = max(test_freq + 1, mult);
> >>>>                                                       ====b====
> >>>>                  }
> >>>>
> >>>>                  /*
> >>>>                   * freq will always be bigger than mult by at least 1.
> >>>>                   * That means we can get the next divider with a DIV_ROUND_UP.
> >>>>                   * This has the advantage of skipping by a whole bunch of divs
> >>>>                   * If the clock framework already bypassed them.
> >>>>                   */
> >>>>                  div = DIV_ROUND_UP(freq, desired_clk);
> >>>>                                                         ===b==
> >>>>          }
> >>>>
> >>>>
> >>>> Will also drop exact_match now.
> >>>>
> >>>> Will upload v3 after testing.
> >>> The more I've been thinking about it, the more I wonder if we even need the
> >>> special case of looking for an exact match at all. It feels like we should choose
> >>> one: we either look for the best match or we look for the one with the
> >>> lowest clock source rate. The weird half-half approach that we have right
> >>> now feels like over-engineering and complicates things.
> >>>
> >>> How about this (again, only lightly tested). Worst case if we _truly_ need a
> >>> close-to-exact match we could pass a tolerance of 0 in and we'd get
> >>> something that's nearly exact, though I'm not suggesting we actually do that.
> >>> If we think 2% is good enough then we should just accept the first (and
> >>> lowest clock rate) 2% match we find.
> >>>
> >>>      abs_tol = div_u64((u64)desired_clk * percent_tol, 100);
> >>>      maxdiv = CLK_DIV_MSK >> CLK_DIV_SHFT;
> >>>      div = 1;
> >>>      while (div <= maxdiv) {
> >>>          mult = (u64)div * desired_clk;
> >>>          if (mult != (unsigned long)mult)
> >>>              break;
> >>>
> >>>          offset = div * abs_tol;
> >>>          freq = clk_round_rate(clk, mult - offset);
> >>>
> >>>          /* Can only get lower if we're done */
> >>>          if (freq < mult - offset)
> >>>              break;
> >>>
> >>>          /*
> >>>           * Re-calculate div in case rounding skipped rates but we
> >>>           * ended up at a good one, then check for a match.
> >>>           */
> >>>          div = DIV_ROUND_CLOSEST(freq, desired_clk);
> >>>          achieved = DIV_ROUND_CLOSEST(freq, div);
> >>>          if (achieved <= desired_clk + abs_tol &&
> >>>              achieved >= desired_clk - abs_tol) {
> >>>              *clk_div = div;
> >>>              return freq;
> >>>          }
> >>>
> >>>          /*
> >>>           * Always increase div by at least one, but we'll go more than
> >>>           * one if clk_round_rate() gave us something higher.
> >>>           */
> >>>          div = DIV_ROUND_UP(max(freq, (unsigned long)mult) + 1, desired_clk);
> >> Wouldn’t DIV_ROUND_UP(freq, desired_clk) suffice here?
> >> freq >= mult-offset, else we would have hit break.
> > No. As you say, freq >= "mult-offset". That means that freq could be
> > == "mult-offset", right? If offset > 0 then freq could be < mult. Then
> > your DIV_ROUND_UP() would just take you right back to where you
> > started the loop with and you'd end up with an infinite loop, wouldn't
> > you?
> >
> Probably No.
>
> ( (freq >= mult-offset) && (freq <= mult) ) =>
>
> ( (freq >= mult-offset) && (freq <= mult+offset) )
>
> would mean that
>
> div = DIV_ROUND_CLOSEST(freq, desired_clk);
> evaluates to original div and we are within tolerance and hence we can return and hence don't even reach DIV_ROUND_UP?
>
> Please note that freq can skip any multiples and land up anywhere.
>
> As long as it has not gone beyond clock rate table, either it lands
> within tolerance of nearest multiple of desired_clk (in which case we
> return)
>
> OR
>
> We move on to next div = (freq/desired_clk + 1)

Ah, you are correct. So just:

div = DIV_ROUND_UP(freq, desired_clk);

...because freq _has_ to be greater than mult. If it was < "mult -
offset" we would have ended the loop. If it was between "mult -
offset" and "mult + offset" (inclusive) then we would have success. So
freq must be > "mult + offset" at the end of the loop.

-Doug
Vijaya Krishna Nivarthi July 7, 2022, 7:24 p.m. UTC | #12
On 7/6/2022 11:51 PM, Doug Anderson wrote:
> Hi,
>
> On Wed, Jul 6, 2022 at 10:44 AM Vijaya Krishna Nivarthi
> <quic_vnivarth@quicinc.com> wrote:
>> Hi,
>>
>>
>> On 7/6/2022 8:56 PM, Doug Anderson wrote:
>>> Hi,
>>>
>>> On Mon, Jul 4, 2022 at 11:57 AM Vijaya Krishna Nivarthi (Temp)
>>> <vnivarth@qti.qualcomm.com> wrote:
>>>> Hi,
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Doug Anderson <dianders@chromium.org>
>>>>> Sent: Friday, July 1, 2022 8:38 PM
>>>>> To: Vijaya Krishna Nivarthi (Temp) (QUIC) <quic_vnivarth@quicinc.com>
>>>>> Cc: Andy Gross <agross@kernel.org>; bjorn.andersson@linaro.org; Konrad
>>>>> Dybcio <konrad.dybcio@somainline.org>; Greg Kroah-Hartman
>>>>> <gregkh@linuxfoundation.org>; Jiri Slaby <jirislaby@kernel.org>; linux-arm-
>>>>> msm <linux-arm-msm@vger.kernel.org>; linux-serial@vger.kernel.org; LKML
>>>>> <linux-kernel@vger.kernel.org>; Mukesh Savaliya (QUIC)
>>>>> <quic_msavaliy@quicinc.com>; Matthias Kaehlcke <mka@chromium.org>;
>>>>> Stephen Boyd <swboyd@chromium.org>
>>>>> Subject: Re: [V2] tty: serial: qcom-geni-serial: Fix get_clk_div_rate() which
>>>>> otherwise could return a sub-optimal clock rate.
>>>>>
>>>>> WARNING: This email originated from outside of Qualcomm. Please be wary
>>>>> of any links or attachments, and do not enable macros.
>>>>>
>>>>> Hi,
>>>>>
>>>>> On Fri, Jul 1, 2022 at 4:04 AM Vijaya Krishna Nivarthi (Temp) (QUIC)
>>>>> <quic_vnivarth@quicinc.com> wrote:
>>>>>
> Ah, you are correct. So just:
>
> div = DIV_ROUND_UP(freq, desired_clk);
>
> ...because freq _has_ to be greater than mult. If it was < "mult -
> offset" we would have ended the loop. If it was between "mult -
> offset" and "mult + offset" (inclusive) then we would have success. So
> freq must be > "mult + offset" at the end of the loop.
>
> -Doug


Thank you, uploaded V3.
diff mbox series

Patch

diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c
index 2e23b65..d0696d1 100644
--- a/drivers/tty/serial/qcom_geni_serial.c
+++ b/drivers/tty/serial/qcom_geni_serial.c
@@ -943,52 +943,111 @@  static int qcom_geni_serial_startup(struct uart_port *uport)
 	return 0;
 }
 
-static unsigned long get_clk_div_rate(struct clk *clk, unsigned int baud,
-			unsigned int sampling_rate, unsigned int *clk_div)
+static unsigned long find_clk_rate_in_tol(struct clk *clk, unsigned int desired_clk,
+			unsigned int *clk_div, unsigned int percent_tol, bool exact_match)
 {
+	unsigned long freq;
+	unsigned long div, maxdiv, new_div;
+	u64 mult;
 	unsigned long ser_clk;
-	unsigned long desired_clk;
-	unsigned long freq, prev;
-	unsigned long div, maxdiv;
-	int64_t mult;
-
-	desired_clk = baud * sampling_rate;
-	if (!desired_clk) {
-		pr_err("%s: Invalid frequency\n", __func__);
-		return 0;
-	}
+	unsigned long test_freq, offset, new_freq;
 
+	ser_clk = 0;
 	maxdiv = CLK_DIV_MSK >> CLK_DIV_SHFT;
-	prev = 0;
+	div = 1;
 
-	for (div = 1; div <= maxdiv; div++) {
-		mult = div * desired_clk;
-		if (mult > ULONG_MAX)
+	while (div <= maxdiv) {
+		mult = (u64)div * desired_clk;
+		if (mult != (unsigned long)mult)
 			break;
 
-		freq = clk_round_rate(clk, (unsigned long)mult);
+		/*
+		 * Loop requesting a freq within tolerance and possibly exact freq.
+		 *
+		 * We'll keep track of the lowest freq inexact match we found
+		 * but always try to find a perfect match. NOTE: this algorithm
+		 * could miss a slightly better freq if there's more than one
+		 * freq between (freq - offset) and (freq) but (freq) can't be made
+		 * exactly, but that's OK.
+		 *
+		 * This absolutely relies on the fact that the Qualcomm clock
+		 * driver always rounds up.
+		 * We make use of exact_match as an I/O param.
+		 */
+
+		/* look only for exact match if within tolerance is already found */
+		if (ser_clk)
+			offset = 0;
+		else
+			offset = div_u64(mult * percent_tol, 100);
+
+		test_freq = mult - offset;
+		freq = clk_round_rate(clk, test_freq);
+
+		/*
+		 * A dead-on freq is an insta-win
+		 */
 		if (!(freq % desired_clk)) {
 			ser_clk = freq;
-			break;
+			*clk_div = freq / desired_clk;
+			return ser_clk;
 		}
 
-		if (!prev)
-			ser_clk = freq;
-		else if (prev == freq)
-			break;
+		if (!ser_clk) {
+			new_div = DIV_ROUND_CLOSEST(freq, desired_clk);
+			new_freq = new_div * desired_clk;
+			offset = (new_freq * percent_tol) / 100;
+
+			if (new_freq - offset <= freq && freq <= new_freq + offset) {
+				/* Save the first (lowest freq) within tolerance */
+				ser_clk = freq;
+				*clk_div = new_div;
+				/* no more search for exact match required in 2nd run */
+				if (!exact_match)
+					break;
+			}
+		}
 
-		prev = freq;
+		div = freq / desired_clk + 1;
+
+		/*
+		 * Only time clock framework doesn't round up is if
+		 * we're past the max clock rate. We're done searching
+		 * if that's the case.
+		 */
+		if (freq < test_freq)
+			break;
 	}
 
-	if (!ser_clk) {
-		pr_err("%s: Can't find matching DFS entry for baud %d\n",
-								__func__, baud);
-		return ser_clk;
+	return ser_clk;
+}
+
+static unsigned long get_clk_div_rate(struct clk *clk, unsigned int baud,
+			unsigned int sampling_rate, unsigned int *clk_div)
+{
+	unsigned long ser_clk;
+	unsigned long desired_clk;
+
+	desired_clk = baud * sampling_rate;
+	if (!desired_clk) {
+		pr_err("%s: Invalid frequency\n", __func__);
+		return 0;
 	}
 
-	*clk_div = ser_clk / desired_clk;
-	if (!(*clk_div))
-		*clk_div = 1;
+	ser_clk = 0;
+	/*
+	 * try to find exact clock rate or within 2% tolerance,
+	 * then within 5% tolerance
+	 */
+	ser_clk = find_clk_rate_in_tol(clk, desired_clk, clk_div, 2, true);
+	if (!ser_clk)
+		ser_clk = find_clk_rate_in_tol(clk, desired_clk, clk_div, 5, false);
+
+	if (!ser_clk)
+		pr_err("Couldn't find suitable clock rate for %d\n", desired_clk);
+	else
+		pr_debug("desired_clk-%d, ser_clk-%d, clk_div-%d\n",
+			desired_clk, ser_clk, *clk_div);
 
 	return ser_clk;
 }
@@ -1021,8 +1080,7 @@  static void qcom_geni_serial_set_termios(struct uart_port *uport,
 	if (ver >= QUP_SE_VERSION_2_5)
 		sampling_rate /= 2;
 
-	clk_rate = get_clk_div_rate(port->se.clk, baud,
-		sampling_rate, &clk_div);
+	clk_rate = get_clk_div_rate(port->se.clk, baud, sampling_rate, &clk_div);
 	if (!clk_rate)
 		goto out_restart_rx;