diff mbox series

[v7,2/7] clocksource/drivers: Add a new driver for the Atmel ARM TC blocks

Message ID 20180913113024.3571-3-alexandre.belloni@bootlin.com (mailing list archive)
State New, archived
Headers show
Series clocksource: rework Atmel TCB timer driver | expand

Commit Message

Alexandre Belloni Sept. 13, 2018, 11:30 a.m. UTC
Add a driver for the Atmel Timer Counter Blocks. This driver provides a
clocksource and two clockevent devices.

One of the clockevent device is linked to the clocksource counter and so it
will run at the same frequency. This will be used when there is only on TCB
channel available for timers.

The other clockevent device runs on a separate TCB channel when available.

This driver uses regmap and syscon to be able to probe early in the boot
and avoid having to switch on the TCB clocksource later. Using regmap also
means that unused TCB channels may be used by other drivers (PWM for
example). read/writel are still used to access channel specific registers
to avoid the performance impact of regmap (mainly locking).

Tested-by: Alexander Dahl <ada@thorsis.com>
Tested-by: Andras Szemzo <szemzo.andras@gmail.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/clocksource/Kconfig           |   8 +
 drivers/clocksource/Makefile          |   3 +-
 drivers/clocksource/timer-atmel-tcb.c | 410 ++++++++++++++++++++++++++
 3 files changed, 420 insertions(+), 1 deletion(-)
 create mode 100644 drivers/clocksource/timer-atmel-tcb.c

Comments

Daniel Lezcano Sept. 24, 2018, 1:59 a.m. UTC | #1
On 13/09/2018 13:30, Alexandre Belloni wrote:
> Add a driver for the Atmel Timer Counter Blocks. This driver provides a
> clocksource and two clockevent devices.
> 
> One of the clockevent device is linked to the clocksource counter and so it
> will run at the same frequency. This will be used when there is only on TCB
> channel available for timers.
> 
> The other clockevent device runs on a separate TCB channel when available.
> 
> This driver uses regmap and syscon to be able to probe early in the boot
> and avoid having to switch on the TCB clocksource later. 

Sorry, I don't get it. Can you elaborate?

> Using regmap also
> means that unused TCB channels may be used by other drivers (PWM for
> example). read/writel are still used to access channel specific registers
> to avoid the performance impact of regmap (mainly locking).

I don't get the regmap reasoning here.

My main concern with this driver is the 16bits chained support. See
below in the comments.


> Tested-by: Alexander Dahl <ada@thorsis.com>
> Tested-by: Andras Szemzo <szemzo.andras@gmail.com>
> Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
> ---
>  drivers/clocksource/Kconfig           |   8 +
>  drivers/clocksource/Makefile          |   3 +-
>  drivers/clocksource/timer-atmel-tcb.c | 410 ++++++++++++++++++++++++++
>  3 files changed, 420 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/clocksource/timer-atmel-tcb.c
> 
> diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
> index a11f4ba98b05..8c7324e409f6 100644
> --- a/drivers/clocksource/Kconfig
> +++ b/drivers/clocksource/Kconfig
> @@ -403,6 +403,14 @@ config ATMEL_ST
>  	help
>  	  Support for the Atmel ST timer.
>  
> +config ATMEL_ARM_TCB_CLKSRC
> +	bool "Microchip ARM TC Block" if COMPILE_TEST
> +	select REGMAP_MMIO
> +	depends on GENERIC_CLOCKEVENTS
> +	help
> +	  This enables build of clocksource and clockevent driver for
> +	  the integrated Timer Counter Blocks in Microchip ARM SoCs.
> +
>  config CLKSRC_EXYNOS_MCT
>  	bool "Exynos multi core timer driver" if COMPILE_TEST
>  	depends on ARM || ARM64
> diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
> index db51b2427e8a..0df9384a1230 100644
> --- a/drivers/clocksource/Makefile
> +++ b/drivers/clocksource/Makefile
> @@ -3,7 +3,8 @@ obj-$(CONFIG_TIMER_OF)		+= timer-of.o
>  obj-$(CONFIG_TIMER_PROBE)	+= timer-probe.o
>  obj-$(CONFIG_ATMEL_PIT)		+= timer-atmel-pit.o
>  obj-$(CONFIG_ATMEL_ST)		+= timer-atmel-st.o
> -obj-$(CONFIG_ATMEL_TCB_CLKSRC)	+= tcb_clksrc.o
> +obj-$(CONFIG_ATMEL_TCB_CLKSRC) += tcb_clksrc.o
> +obj-$(CONFIG_ATMEL_ARM_TCB_CLKSRC)	+= timer-atmel-tcb.o
>  obj-$(CONFIG_X86_PM_TIMER)	+= acpi_pm.o
>  obj-$(CONFIG_SCx200HR_TIMER)	+= scx200_hrt.o
>  obj-$(CONFIG_CS5535_CLOCK_EVENT_SRC)	+= cs5535-clockevt.o
> diff --git a/drivers/clocksource/timer-atmel-tcb.c b/drivers/clocksource/timer-atmel-tcb.c
> new file mode 100644
> index 000000000000..21fbe430f91b
> --- /dev/null
> +++ b/drivers/clocksource/timer-atmel-tcb.c
> @@ -0,0 +1,410 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/clk.h>
> +#include <linux/clockchips.h>
> +#include <linux/clocksource.h>
> +#include <linux/interrupt.h>
> +#include <linux/kernel.h>
> +#include <linux/mfd/syscon.h>
> +#include <linux/of_address.h>
> +#include <linux/of_irq.h>
> +#include <linux/regmap.h>
> +#include <linux/sched_clock.h>
> +#include <soc/at91/atmel_tcb.h>
> +
> +struct atmel_tcb_clksrc {
> +	struct clocksource clksrc;
> +	struct clock_event_device clkevt;
> +	struct regmap *regmap;
> +	void __iomem *base;
> +	struct clk *clk[2];
> +	char name[20];

You can reasonably remove this field and use directly the ones in the
clocksrc/evt.

> +	int channels[2];
> +	int bits;
> +	int irq;

After removing the request_irq/free_irq calls below (see comment), this
field can be removed.

> +	struct {
> +		u32 cmr;
> +		u32 imr;
> +		u32 rc;
> +		bool clken;

Not sure clken is needed, 16/32 is enough information.

> +	} cache[2];
> +	u32 bmr_cache;

Are you sure you should save the bmr content ?

> +	bool registered;
> +	bool clk_enabled;

Not used.

> +};
> +
> +static struct atmel_tcb_clksrc tc;
> +
> +static struct clk *tcb_clk_get(struct device_node *node, int channel)
> +{
> +	struct clk *clk;
> +	char clk_name[] = "t0_clk";
> +
> +	clk_name[1] += channel;
> +	clk = of_clk_get_by_name(node->parent, clk_name);
> +	if (!IS_ERR(clk))
> +		return clk;
> +
> +	return of_clk_get_by_name(node->parent, "t0_clk");

Can you explain why returning "t0_clk" is better than returning an error?

> +}
> +
> +/*
> + * Clocksource and clockevent using the same channel(s)
> + */
> +static u64 tc_get_cycles(struct clocksource *cs)
> +{
> +	u32 lower, upper;
> +
> +	do {
> +		upper = readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[1]));
> +		lower = readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[0]));
> +	} while (upper != readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[1])));
> +
> +	return (upper << 16) | lower;
> +}
> +
> +static u64 tc_get_cycles32(struct clocksource *cs)
> +{
> +	return readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[0]));
> +}
> +
> +static u64 notrace tc_sched_clock_read(void)
> +{
> +	return tc_get_cycles(&tc.clksrc);
> +}
> +
> +static u64 notrace tc_sched_clock_read32(void)
> +{
> +	return tc_get_cycles32(&tc.clksrc);
> +}
> +
> +static int tcb_clkevt_next_event(unsigned long delta,
> +				 struct clock_event_device *d)
> +{
> +	u32 old, next, cur;
> +
> +	old = readl(tc.base + ATMEL_TC_CV(tc.channels[0]));
> +	next = old + delta;
> +	writel(next, tc.base + ATMEL_TC_RC(tc.channels[0]));
> +	cur = readl(tc.base + ATMEL_TC_CV(tc.channels[0]));
> +
> +	/* check whether the delta elapsed while setting the register */
> +	if ((next < old && cur < old && cur > next) ||
> +	    (next > old && (cur < old || cur > next))) {
> +		/*
> +		 * Clear the CPCS bit in the status register to avoid
> +		 * generating a spurious interrupt next time a valid
> +		 * timer event is configured.
> +		 */
> +		old = readl(tc.base + ATMEL_TC_SR(tc.channels[0]));
> +		return -ETIME;
> +	}
> > +	writel(ATMEL_TC_CPCS, tc.base + ATMEL_TC_IER(tc.channels[0]));


How this is compatible with 16bits as defined in the init function ?

> +	return 0;
> +}
> +
> +static irqreturn_t tc_clkevt_irq(int irq, void *handle)
> +{
> +	unsigned int sr;
> +
> +	sr = readl(tc.base + ATMEL_TC_SR(tc.channels[0]));
> +	if (sr & ATMEL_TC_CPCS) {
> +		tc.clkevt.event_handler(&tc.clkevt);
> +		return IRQ_HANDLED;
> +	}
> +
> +	return IRQ_NONE;
> +}
> +
> +static int tcb_clkevt_oneshot(struct clock_event_device *dev)
> +{
> +	if (clockevent_state_oneshot(dev))
> +		return 0;
> +
> +	/*
> +	 * Because both clockevent devices may share the same IRQ, we don't want
> +	 * the less likely one to stay requested
> +	 */
> +	return request_irq(tc.irq, tc_clkevt_irq, IRQF_TIMER | IRQF_SHARED,
> +			   tc.name, &tc);
> +}
> +
> +static int tcb_clkevt_shutdown(struct clock_event_device *dev)
> +{
> +	writel(0xff, tc.base + ATMEL_TC_IDR(tc.channels[0]));
> +	if (tc.bits == 16)
> +		writel(0xff, tc.base + ATMEL_TC_IDR(tc.channels[1]));
> +
> +	if (!clockevent_state_detached(dev))
> +		free_irq(tc.irq, &tc);

Why are you requesting and freeing the irq instead of using the
disable/enable register operations ?

> +	return 0;
> +}
> +
> +static void __init tcb_setup_dual_chan(struct atmel_tcb_clksrc *tc,
> +				       int mck_divisor_idx)
> +{
> +	/* first channel: waveform mode, input mclk/8, clock TIOA on overflow */
> +	writel(mck_divisor_idx			/* likely divide-by-8 */
> +	       | ATMEL_TC_CMR_WAVE
> +	       | ATMEL_TC_CMR_WAVESEL_UP	/* free-run */
> +	       | ATMEL_TC_CMR_ACPA(SET)		/* TIOA rises at 0 */
> +	       | ATMEL_TC_CMR_ACPC(CLEAR),	/* (duty cycle 50%) */
> +	       tc->base + ATMEL_TC_CMR(tc->channels[0]));
> +	writel(0x0000, tc->base + ATMEL_TC_RA(tc->channels[0]));
> +	writel(0x8000, tc->base + ATMEL_TC_RC(tc->channels[0]));
> +	writel(0xff, tc->base + ATMEL_TC_IDR(tc->channels[0]));	/* no irqs */
> +	writel(ATMEL_TC_CCR_CLKEN, tc->base + ATMEL_TC_CCR(tc->channels[0]));
> +
> +	/* second channel: waveform mode, input TIOA */
> +	writel(ATMEL_TC_CMR_XC(tc->channels[1])		/* input: TIOA */
> +	       | ATMEL_TC_CMR_WAVE
> +	       | ATMEL_TC_CMR_WAVESEL_UP,		/* free-run */
> +	       tc->base + ATMEL_TC_CMR(tc->channels[1]));
> +	writel(0xff, tc->base + ATMEL_TC_IDR(tc->channels[1]));	/* no irqs */
> +	writel(ATMEL_TC_CCR_CLKEN, tc->base + ATMEL_TC_CCR(tc->channels[1]));
> +
> +	/* chain both channel, we assume the previous channel */
> +	regmap_write(tc->regmap, ATMEL_TC_BMR,
> +		     ATMEL_TC_BMR_TCXC(1 + tc->channels[1], tc->channels[1]));
> +	/* then reset all the timers */
> +	regmap_write(tc->regmap, ATMEL_TC_BCR, ATMEL_TC_BCR_SYNC);
> +}
> +
> +static void __init tcb_setup_single_chan(struct atmel_tcb_clksrc *tc,
> +					 int mck_divisor_idx)
> +{
> +	/* channel 0:  waveform mode, input mclk/8 */
> +	writel(mck_divisor_idx			/* likely divide-by-8 */
> +	       | ATMEL_TC_CMR_WAVE
> +	       | ATMEL_TC_CMR_WAVESEL_UP,	/* free-run */
> +	       tc->base + ATMEL_TC_CMR(tc->channels[0]));
> +	writel(0xff, tc->base + ATMEL_TC_IDR(tc->channels[0]));	/* no irqs */
> +	writel(ATMEL_TC_CCR_CLKEN, tc->base + ATMEL_TC_CCR(tc->channels[0]));
> +
> +	/* then reset all the timers */
> +	regmap_write(tc->regmap, ATMEL_TC_BCR, ATMEL_TC_BCR_SYNC);
> +}
> +
> +static void tc_clksrc_suspend(struct clocksource *cs)
> +{
> +	int i;
> +
> +	for (i = 0; i < 1 + (tc.bits == 16); i++) {
> +		tc.cache[i].cmr = readl(tc.base + ATMEL_TC_CMR(tc.channels[i]));
> +		tc.cache[i].imr = readl(tc.base + ATMEL_TC_IMR(tc.channels[i]));
> +		tc.cache[i].rc = readl(tc.base + ATMEL_TC_RC(tc.channels[i]));
> +		tc.cache[i].clken = !!(readl(tc.base +
> +					     ATMEL_TC_SR(tc.channels[i])) &
> +				       ATMEL_TC_CLKSTA);
> +	}
> +
> +	if (tc.bits == 16)
> +		regmap_read(tc.regmap, ATMEL_TC_BMR, &tc.bmr_cache);
> +}
> +
> +static void tc_clksrc_resume(struct clocksource *cs)
> +{
> +	int i;
> +
> +	for (i = 0; i < 1 + (tc.bits == 16); i++) {
> +		/* Restore registers for the channel, RA and RB are not used  */
> +		writel(tc.cache[i].cmr, tc.base + ATMEL_TC_CMR(tc.channels[i]));
> +		writel(tc.cache[i].rc, tc.base + ATMEL_TC_RC(tc.channels[i]));
> +		writel(0, tc.base + ATMEL_TC_RA(tc.channels[i]));
> +		writel(0, tc.base + ATMEL_TC_RB(tc.channels[i]));
> +		/* Disable all the interrupts */
> +		writel(0xff, tc.base + ATMEL_TC_IDR(tc.channels[i]));
> +		/* Reenable interrupts that were enabled before suspending */
> +		writel(tc.cache[i].imr, tc.base + ATMEL_TC_IER(tc.channels[i]));
> +
> +		/* Start the clock if it was used */
> +		if (tc.cache[i].clken)
> +			writel(ATMEL_TC_CCR_CLKEN, tc.base +
> +			       ATMEL_TC_CCR(tc.channels[i]));
> +	}
> +
> +	/* in case of dual channel, chain channels */
> +	if (tc.bits == 16)
> +		regmap_write(tc.regmap, ATMEL_TC_BMR, tc.bmr_cache);
> +	/* Finally, trigger all the channels*/
> +	regmap_write(tc.regmap, ATMEL_TC_BCR, ATMEL_TC_BCR_SYNC);
> +}
> +
> +static int __init tcb_clksrc_register(struct device_node *node,
> +				      struct regmap *regmap, void __iomem *base,
> +				      int channel, int channel1, int irq,
> +				      int bits)
> +{
> +	u32 rate, divided_rate = 0;
> +	int best_divisor_idx = -1;
> +	int i, err = -1;
> +	u64 (*tc_sched_clock)(void);
> +
> +	tc.regmap = regmap;
> +	tc.base = base;
> +	tc.channels[0] = channel;
> +	tc.channels[1] = channel1;
> +	tc.irq = irq;
> +	tc.bits = bits;
> +
> +	tc.clk[0] = tcb_clk_get(node, tc.channels[0]);
> +	if (IS_ERR(tc.clk[0]))
> +		return PTR_ERR(tc.clk[0]);
> +	err = clk_prepare_enable(tc.clk[0]);
> +	if (err) {
> +		pr_debug("can't enable T0 clk\n");
> +		goto err_clk;
> +	}
> +
> +	/* How fast will we be counting?  Pick something over 5 MHz.  */
> +	rate = (u32)clk_get_rate(tc.clk[0]);
> +	for (i = 0; i < 5; i++) {
> +		unsigned int divisor = atmel_tc_divisors[i];
> +		unsigned int tmp;
> +
> +		if (!divisor)
> +			continue;

I suppose you meant here 'break' ? Use atmel_tc_divisors[] = { 2, 8, 32,
128 }; And then the ARRAY_SIZE macro.

> +		tmp = rate / divisor;
> +		pr_debug("TC: %u / %-3u [%d] --> %u\n", rate, divisor, i, tmp);
> +		if (best_divisor_idx > 0) {
> +			if (tmp < 5 * 1000 * 1000)
> +				continue;
> +		}
> +		divided_rate = tmp;
> +		best_divisor_idx = i;

What is a best divisor ? The highest one or the one closer to 5MHz ?

> +	}
> +
> +	if (tc.bits == 32) {
> +		tc.clksrc.read = tc_get_cycles32;
> +		tcb_setup_single_chan(&tc, best_divisor_idx);
> +		tc_sched_clock = tc_sched_clock_read32;
> +		snprintf(tc.name, sizeof(tc.name), "%s:%d",
> +			 kbasename(node->parent->full_name), tc.channels[0]);
> +	} else {
> +		tc.clk[1] = tcb_clk_get(node, tc.channels[1]);
> +		if (IS_ERR(tc.clk[1]))
> +			goto err_disable_t0;

This is very confusing. If the function tcb_clk_get() fails with this
channel, it will return "t0_clk" and will be used here ? Why ?

> +		err = clk_prepare_enable(tc.clk[1]);
> +		if (err) {
> +			pr_debug("can't enable T1 clk\n");
> +			goto err_clk1;
> +		}
> +		tc.clksrc.read = tc_get_cycles,
> +		tcb_setup_dual_chan(&tc, best_divisor_idx);
> +		tc_sched_clock = tc_sched_clock_read;
> +		snprintf(tc.name, sizeof(tc.name), "%s:%d,%d",
> +			 kbasename(node->parent->full_name), tc.channels[0],
> +			 tc.channels[1]);
> +	}
> +
> +	pr_debug("%s at %d.%03d MHz\n", tc.name,
> +		 divided_rate / 1000000,
> +		 ((divided_rate + 500000) % 1000000) / 1000);

Using two channels to emulate a 32bits timer has a significant cost,
especially in the sched_clock function which is part of the hot kernel
path. In addition it makes the code less maintainable and readable.

Why don't you just stick to a specific rate with the prescalar value and
reduce the rating of the timer ? (example in the stm32 timer,
stm32_timer_set_prescaler and init function).

It will be less precise (thus the lower rating) but will make the system
faster by preventing multiple register reads in the sched_clock.

Is it an acceptable trade-off ?

> +	tc.clksrc.name = tc.name;
> +	tc.clksrc.suspend = tc_clksrc_suspend;
> +	tc.clksrc.resume = tc_clksrc_resume;
> +	tc.clksrc.rating = 200;
> +	tc.clksrc.mask = CLOCKSOURCE_MASK(32);
> +	tc.clksrc.flags = CLOCK_SOURCE_IS_CONTINUOUS;
> +
> +	err = clocksource_register_hz(&tc.clksrc, divided_rate);
> +	if (err)
> +		goto err_disable_t1;
> +
> +	sched_clock_register(tc_sched_clock, 32, divided_rate);
> +
> +	tc.registered = true;
> +
> +	/* Set up and register clockevents */
> +	tc.clkevt.name = tc.name;
> +	tc.clkevt.cpumask = cpumask_of(0);
> +	tc.clkevt.set_next_event = tcb_clkevt_next_event;
> +	tc.clkevt.set_state_oneshot = tcb_clkevt_oneshot;
> +	tc.clkevt.set_state_shutdown = tcb_clkevt_shutdown;
> +	tc.clkevt.features = CLOCK_EVT_FEAT_ONESHOT;
> +	tc.clkevt.rating = 125;
> +
> +	clockevents_config_and_register(&tc.clkevt, divided_rate, 1,
> +					BIT(tc.bits) - 1);
> +
> +	return 0;
> +
> +err_disable_t1:
> +	if (tc.bits == 16)
> +		clk_disable_unprepare(tc.clk[1]);
> +
> +err_clk1:
> +	if (tc.bits == 16)
> +		clk_put(tc.clk[1]);
> +
> +err_disable_t0:
> +	clk_disable_unprepare(tc.clk[0]);
> +
> +err_clk:
> +	clk_put(tc.clk[0]);
> +
> +	pr_err("%s: unable to register clocksource/clockevent\n",
> +	       tc.clksrc.name);
> +
> +	return err;
> +}
> +
> +static int __init tcb_clksrc_init(struct device_node *node)
> +{
> +	const struct of_device_id *match;
> +	struct regmap *regmap;
> +	void __iomem *tcb_base;
> +	u32 channel;
> +	int irq, err, chan1 = -1;
> +	unsigned bits;
> +
> +	if (tc.registered)
> +		return -ENODEV;
> +
> +	/*
> +	 * The regmap has to be used to access registers that are shared
> +	 * between channels on the same TCB but we keep direct IO access for
> +	 * the counters to avoid the impact on performance
> +	 */
> +	regmap = syscon_node_to_regmap(node->parent);
> +	if (IS_ERR(regmap))
> +		return PTR_ERR(regmap);
> +
> +	tcb_base = of_iomap(node->parent, 0);
> +	if (!tcb_base) {
> +		pr_err("%s +%d %s\n", __FILE__, __LINE__, __func__);

Remove those debug information and replace them by a proper error message.

> +		return -ENXIO;
> +	}
> +
> +	match = of_match_node(atmel_tcb_dt_ids, node->parent);
> +	bits = (uintptr_t)match->data;
> +
> +	err = of_property_read_u32_index(node, "reg", 0, &channel);
> +	if (err)
> +		return err;
> +
> +	irq = of_irq_get(node->parent, channel);
> +	if (irq < 0) {

if (irq <= 0) {

> +		irq = of_irq_get(node->parent, 0);

Why ?

> +		if (irq < 0)

if (irq <= 0) {

> +			return irq;
> +	}
> +
> +	if (bits == 16) {
> +		of_property_read_u32_index(node, "reg", 1, &chan1);
> +		if (chan1 == -1) {
> +			pr_err("%s: clocksource needs two channels\n",
> +			       node->parent->full_name);

Think about it. The code is giving up at this point in the boot process.
So of two things, you consider there is an alternate clocksource /
clockevent or the system hangs:

 - If there is an alternate clocksource why support 32bits by chaining
the channels with the cost it introduces instead of using the alternate
one ?

 - If there is no alternate clocksource why not support a 16bits less
precise timer and give up with the 32bits emulation and the complexity
it introduces in this driver ?

> +			return -EINVAL;
> +		}
> +	}
> +	return tcb_clksrc_register(node, regmap, tcb_base, channel, chan1, irq,
> +				   bits);
> +}
> +TIMER_OF_DECLARE(atmel_tcb_clksrc, "atmel,tcb-timer", tcb_clksrc_init);
>
Alexandre Belloni Sept. 25, 2018, 9:16 p.m. UTC | #2
On 24/09/2018 03:59:55+0200, Daniel Lezcano wrote:
> On 13/09/2018 13:30, Alexandre Belloni wrote:
> > Add a driver for the Atmel Timer Counter Blocks. This driver provides a
> > clocksource and two clockevent devices.
> > 
> > One of the clockevent device is linked to the clocksource counter and so it
> > will run at the same frequency. This will be used when there is only on TCB
> > channel available for timers.
> > 
> > The other clockevent device runs on a separate TCB channel when available.
> > 
> > This driver uses regmap and syscon to be able to probe early in the boot
> > and avoid having to switch on the TCB clocksource later. 
> 
> Sorry, I don't get it. Can you elaborate?
> 

The current existing way of sharing TCB channels is getting probed to
late in the boot process to be used as the clocksource so currently, the
PIT is necessary to act as the clocksource until the TCB clocksource can
be probed.

This is a big issue for SoCs without a PIT, they simply can't boot.

This also solves:
33d8c15559df Revert "clocksource/drivers/tcb_clksrc: Use 32 bit tcb as sched_clock"
7b9f1d16e6d1 clocksource/drivers/tcb_clksrc: Use 32 bit tcb as sched_clock


> > Using regmap also
> > means that unused TCB channels may be used by other drivers (PWM for
> > example). read/writel are still used to access channel specific registers
> > to avoid the performance impact of regmap (mainly locking).
> 
> I don't get the regmap reasoning here.

Because there are 3 channels per TCB, some TCB can have channels handled
by different drivers (say channel 0 for clocksource, channel 1 for
clockevent and channel 2 for PWM). There are configuration registers that
are shared for all the channels and so the access needs to be handled
properly. But as we discussed on a previous version of the patch, we
don't want to lock/unlock each time we read the clocksource so for the
channel specific registers, readl/writel is used directly.

> 
> My main concern with this driver is the 16bits chained support. See
> below in the comments.
> 
> 
> > +struct atmel_tcb_clksrc {
> > +	struct clocksource clksrc;
> > +	struct clock_event_device clkevt;
> > +	struct regmap *regmap;
> > +	void __iomem *base;
> > +	struct clk *clk[2];
> > +	char name[20];
> 
> You can reasonably remove this field and use directly the ones in the
> clocksrc/evt.
> 

name in struct clocksource is a pointer to a string, we still need a
place to store that string.

> > +	int channels[2];
> > +	int bits;
> > +	int irq;
> 
> After removing the request_irq/free_irq calls below (see comment), this
> field can be removed.
> 
> > +	struct {
> > +		u32 cmr;
> > +		u32 imr;
> > +		u32 rc;
> > +		bool clken;
> 
> Not sure clken is needed, 16/32 is enough information.
> 

This as nothing to do with 16/32. We always need to now whether the
timer was running or not.

> > +	} cache[2];
> > +	u32 bmr_cache;
> 
> Are you sure you should save the bmr content ?
> 

We need to restore at least part of it. We may need to be more clever
about it but this is the current behaviour and it has been working fine.

> > +	bool registered;
> > +	bool clk_enabled;
> 
> Not used.
> 

I guess they are use in the following patch.

> > +};
> > +
> > +static struct atmel_tcb_clksrc tc;
> > +
> > +static struct clk *tcb_clk_get(struct device_node *node, int channel)
> > +{
> > +	struct clk *clk;
> > +	char clk_name[] = "t0_clk";
> > +
> > +	clk_name[1] += channel;
> > +	clk = of_clk_get_by_name(node->parent, clk_name);
> > +	if (!IS_ERR(clk))
> > +		return clk;
> > +
> > +	return of_clk_get_by_name(node->parent, "t0_clk");
> 
> Can you explain why returning "t0_clk" is better than returning an error?
> 

This is the current tclib behavior and doing otherwise would break the
DT ABI.
The reason for this behavior is that some TCB may have a clock
per channel while others have one clock for the whole block.

> > +}
> > +
> > +/*
> > + * Clocksource and clockevent using the same channel(s)
> > + */
> > +static u64 tc_get_cycles(struct clocksource *cs)
> > +{
> > +	u32 lower, upper;
> > +
> > +	do {
> > +		upper = readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[1]));
> > +		lower = readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[0]));
> > +	} while (upper != readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[1])));
> > +
> > +	return (upper << 16) | lower;
> > +}
> > +
> > +static u64 tc_get_cycles32(struct clocksource *cs)
> > +{
> > +	return readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[0]));
> > +}
> > +
> > +static u64 notrace tc_sched_clock_read(void)
> > +{
> > +	return tc_get_cycles(&tc.clksrc);
> > +}
> > +
> > +static u64 notrace tc_sched_clock_read32(void)
> > +{
> > +	return tc_get_cycles32(&tc.clksrc);
> > +}
> > +
> > +static int tcb_clkevt_next_event(unsigned long delta,
> > +				 struct clock_event_device *d)
> > +{
> > +	u32 old, next, cur;
> > +
> > +	old = readl(tc.base + ATMEL_TC_CV(tc.channels[0]));
> > +	next = old + delta;
> > +	writel(next, tc.base + ATMEL_TC_RC(tc.channels[0]));
> > +	cur = readl(tc.base + ATMEL_TC_CV(tc.channels[0]));
> > +
> > +	/* check whether the delta elapsed while setting the register */
> > +	if ((next < old && cur < old && cur > next) ||
> > +	    (next > old && (cur < old || cur > next))) {
> > +		/*
> > +		 * Clear the CPCS bit in the status register to avoid
> > +		 * generating a spurious interrupt next time a valid
> > +		 * timer event is configured.
> > +		 */
> > +		old = readl(tc.base + ATMEL_TC_SR(tc.channels[0]));
> > +		return -ETIME;
> > +	}
> > > +	writel(ATMEL_TC_CPCS, tc.base + ATMEL_TC_IER(tc.channels[0]));
> 
> 
> How this is compatible with 16bits as defined in the init function ?
> 

This is working fine because it is the lower bits channel and in that
case, clockevents_config_and_register is call with the proper mask (16
lower bits sets).

> > +	return 0;
> > +}
> > +
> > +static irqreturn_t tc_clkevt_irq(int irq, void *handle)
> > +{
> > +	unsigned int sr;
> > +
> > +	sr = readl(tc.base + ATMEL_TC_SR(tc.channels[0]));
> > +	if (sr & ATMEL_TC_CPCS) {
> > +		tc.clkevt.event_handler(&tc.clkevt);
> > +		return IRQ_HANDLED;
> > +	}
> > +
> > +	return IRQ_NONE;
> > +}
> > +
> > +static int tcb_clkevt_oneshot(struct clock_event_device *dev)
> > +{
> > +	if (clockevent_state_oneshot(dev))
> > +		return 0;
> > +
> > +	/*
> > +	 * Because both clockevent devices may share the same IRQ, we don't want
> > +	 * the less likely one to stay requested
> > +	 */
> > +	return request_irq(tc.irq, tc_clkevt_irq, IRQF_TIMER | IRQF_SHARED,
> > +			   tc.name, &tc);
> > +}
> > +
> > +static int tcb_clkevt_shutdown(struct clock_event_device *dev)
> > +{
> > +	writel(0xff, tc.base + ATMEL_TC_IDR(tc.channels[0]));
> > +	if (tc.bits == 16)
> > +		writel(0xff, tc.base + ATMEL_TC_IDR(tc.channels[1]));
> > +
> > +	if (!clockevent_state_detached(dev))
> > +		free_irq(tc.irq, &tc);
> 
> Why are you requesting and freeing the irq instead of using the
> disable/enable register operations ?
> 

To avoid going through two interrupt handlers when we know that one is
never used (that is when we have a separate channel for the clockevent,
see following patch).

> > +	/* How fast will we be counting?  Pick something over 5 MHz.  */
> > +	rate = (u32)clk_get_rate(tc.clk[0]);
> > +	for (i = 0; i < 5; i++) {
> > +		unsigned int divisor = atmel_tc_divisors[i];
> > +		unsigned int tmp;
> > +
> > +		if (!divisor)
> > +			continue;
> 
> I suppose you meant here 'break' ? Use atmel_tc_divisors[] = { 2, 8, 32,
> 128 }; And then the ARRAY_SIZE macro.
> 
> > +		tmp = rate / divisor;
> > +		pr_debug("TC: %u / %-3u [%d] --> %u\n", rate, divisor, i, tmp);
> > +		if (best_divisor_idx > 0) {
> > +			if (tmp < 5 * 1000 * 1000)
> > +				continue;
> > +		}
> > +		divided_rate = tmp;
> > +		best_divisor_idx = i;
> 
> What is a best divisor ? The highest one or the one closer to 5MHz ?
> 

The whole divisor selection is coming for the previous driver and I'd
rather not change it at this point, this is the topic of an other
series.

It chooses the first divisor that gives a counting rate over 5MHz


> > +	}
> > +
> > +	if (tc.bits == 32) {
> > +		tc.clksrc.read = tc_get_cycles32;
> > +		tcb_setup_single_chan(&tc, best_divisor_idx);
> > +		tc_sched_clock = tc_sched_clock_read32;
> > +		snprintf(tc.name, sizeof(tc.name), "%s:%d",
> > +			 kbasename(node->parent->full_name), tc.channels[0]);
> > +	} else {
> > +		tc.clk[1] = tcb_clk_get(node, tc.channels[1]);
> > +		if (IS_ERR(tc.clk[1]))
> > +			goto err_disable_t0;
> 
> This is very confusing. If the function tcb_clk_get() fails with this
> channel, it will return "t0_clk" and will be used here ? Why ?
> 

See earlier explanation.

> > +		err = clk_prepare_enable(tc.clk[1]);
> > +		if (err) {
> > +			pr_debug("can't enable T1 clk\n");
> > +			goto err_clk1;
> > +		}
> > +		tc.clksrc.read = tc_get_cycles,
> > +		tcb_setup_dual_chan(&tc, best_divisor_idx);
> > +		tc_sched_clock = tc_sched_clock_read;
> > +		snprintf(tc.name, sizeof(tc.name), "%s:%d,%d",
> > +			 kbasename(node->parent->full_name), tc.channels[0],
> > +			 tc.channels[1]);
> > +	}
> > +
> > +	pr_debug("%s at %d.%03d MHz\n", tc.name,
> > +		 divided_rate / 1000000,
> > +		 ((divided_rate + 500000) % 1000000) / 1000);
> 
> Using two channels to emulate a 32bits timer has a significant cost,
> especially in the sched_clock function which is part of the hot kernel
> path. In addition it makes the code less maintainable and readable.
> 
> Why don't you just stick to a specific rate with the prescalar value and
> reduce the rating of the timer ? (example in the stm32 timer,
> stm32_timer_set_prescaler and init function).
> 
> It will be less precise (thus the lower rating) but will make the system
> faster by preventing multiple register reads in the sched_clock.
> 
> Is it an acceptable trade-off ?
> 

Not at this point, the goal is to not change the current behaviour.
Some customer rely on the fast timer (they are bitbanging some RF
protocols) and counting at more that 5MHz using a 16 bit timer is
definitively too fast.

This is something that could be changed once we implement timer rate
selection (but I doubt it will make the code more readable).

I'm not saying we shouldn't question what was done 10 years ago but I'd
rather not change it is this series.

Also, the goal is to get rid of the tcb_clksrc driver now that avr32 is
gone. This will be done once the pwm driver is converted (I did that in
v1).

> > +	tcb_base = of_iomap(node->parent, 0);
> > +	if (!tcb_base) {
> > +		pr_err("%s +%d %s\n", __FILE__, __LINE__, __func__);
> 
> Remove those debug information and replace them by a proper error message.
> 

My mistake, this will be simply removed.

> > +		return -ENXIO;
> > +	}
> > +
> > +	match = of_match_node(atmel_tcb_dt_ids, node->parent);
> > +	bits = (uintptr_t)match->data;
> > +
> > +	err = of_property_read_u32_index(node, "reg", 0, &channel);
> > +	if (err)
> > +		return err;
> > +
> > +	irq = of_irq_get(node->parent, channel);
> > +	if (irq < 0) {
> 
> if (irq <= 0) {
> 
> > +		irq = of_irq_get(node->parent, 0);
> 
> Why ?
> 

See the binding, the timer is a child of the TCB and the TCB node has
the irq info. So, the TCB is defined in the dtsi and the child nodes are
in the board dts.

> > +		if (irq < 0)
> 
> if (irq <= 0) {
> 
> > +			return irq;
> > +	}
> > +
> > +	if (bits == 16) {
> > +		of_property_read_u32_index(node, "reg", 1, &chan1);
> > +		if (chan1 == -1) {
> > +			pr_err("%s: clocksource needs two channels\n",
> > +			       node->parent->full_name);
> 
> Think about it. The code is giving up at this point in the boot process.
> So of two things, you consider there is an alternate clocksource /
> clockevent or the system hangs:
> 
>  - If there is an alternate clocksource why support 32bits by chaining
> the channels with the cost it introduces instead of using the alternate
> one ?
> 

The PIT is almost always the worse clocksource as it is very slow.

>  - If there is no alternate clocksource why not support a 16bits less
> precise timer and give up with the 32bits emulation and the complexity
> it introduces in this driver ?
> 

If there is not alternate clocksource, the TCB is 32bit.
Daniel Lezcano Oct. 1, 2018, 9:24 p.m. UTC | #3
On 25/09/2018 23:16, Alexandre Belloni wrote:
> On 24/09/2018 03:59:55+0200, Daniel Lezcano wrote:
>> On 13/09/2018 13:30, Alexandre Belloni wrote:
>>> Add a driver for the Atmel Timer Counter Blocks. This driver provides a
>>> clocksource and two clockevent devices.
>>>
>>> One of the clockevent device is linked to the clocksource counter and so it
>>> will run at the same frequency. This will be used when there is only on TCB
>>> channel available for timers.
>>>
>>> The other clockevent device runs on a separate TCB channel when available.
>>>
>>> This driver uses regmap and syscon to be able to probe early in the boot
>>> and avoid having to switch on the TCB clocksource later. 
>>
>> Sorry, I don't get it. Can you elaborate?
>>
> 
> The current existing way of sharing TCB channels is getting probed to
> late in the boot process to be used as the clocksource so currently, the
> PIT is necessary to act as the clocksource until the TCB clocksource can
> be probed.
> 
> This is a big issue for SoCs without a PIT, they simply can't boot.

I'm still missing the point. The timer (clocksource + clocksource) is
probed very early with TIMER_OF_DECLARE.


> This also solves:
> 33d8c15559df Revert "clocksource/drivers/tcb_clksrc: Use 32 bit tcb as sched_clock"
> 7b9f1d16e6d1 clocksource/drivers/tcb_clksrc: Use 32 bit tcb as sched_clock
> 
> 
>>> Using regmap also
>>> means that unused TCB channels may be used by other drivers (PWM for
>>> example). read/writel are still used to access channel specific registers
>>> to avoid the performance impact of regmap (mainly locking).
>>
>> I don't get the regmap reasoning here.
> 
> Because there are 3 channels per TCB, some TCB can have channels handled
> by different drivers (say channel 0 for clocksource, channel 1 for
> clockevent and channel 2 for PWM). There are configuration registers that
> are shared for all the channels and so the access needs to be handled
> properly. But as we discussed on a previous version of the patch, we
> don't want to lock/unlock each time we read the clocksource so for the
> channel specific registers, readl/writel is used directly.

Can you point me to the code where we have racy access to the
ATMEL_TC_BMR register ?


>> My main concern with this driver is the 16bits chained support. See
>> below in the comments.
>>
>>
>>> +struct atmel_tcb_clksrc {
>>> +	struct clocksource clksrc;
>>> +	struct clock_event_device clkevt;
>>> +	struct regmap *regmap;
>>> +	void __iomem *base;
>>> +	struct clk *clk[2];
>>> +	char name[20];
>>
>> You can reasonably remove this field and use directly the ones in the
>> clocksrc/evt.
>>
> 
> name in struct clocksource is a pointer to a string, we still need a
> place to store that string.

Come on!

char *name = kasprintf(...);

tc.clkevt.name = name;
tc.clksrc.name = name;

no need to add a specific field for this.

Alternatively, you can make use of the
kbasename(node->parent->full_name) only without the channel numbering.

>>> +	int channels[2];
>>> +	int bits;
>>> +	int irq;
>>
>> After removing the request_irq/free_irq calls below (see comment), this
>> field can be removed.
>>
>>> +	struct {
>>> +		u32 cmr;
>>> +		u32 imr;
>>> +		u32 rc;
>>> +		bool clken;
>>
>> Not sure clken is needed, 16/32 is enough information.
>>
> 
> This as nothing to do with 16/32. We always need to now whether the
> timer was running or not.

Mmh, not sure why I said that. Anyway ...

>>> +	} cache[2];
>>> +	u32 bmr_cache;
>>
>> Are you sure you should save the bmr content ?
>>
> 
> We need to restore at least part of it. We may need to be more clever
> about it but this is the current behaviour and it has been working fine.
> 
>>> +	bool registered;
>>> +	bool clk_enabled;
>>
>> Not used.
>>
> 
> I guess they are use in the following patch.

Move them to the patch making use of it.

>>> +};
>>> +
>>> +static struct atmel_tcb_clksrc tc;
>>> +
>>> +static struct clk *tcb_clk_get(struct device_node *node, int channel)
>>> +{
>>> +	struct clk *clk;
>>> +	char clk_name[] = "t0_clk";
>>> +
>>> +	clk_name[1] += channel;
>>> +	clk = of_clk_get_by_name(node->parent, clk_name);
>>> +	if (!IS_ERR(clk))
>>> +		return clk;
>>> +
>>> +	return of_clk_get_by_name(node->parent, "t0_clk");
>>
>> Can you explain why returning "t0_clk" is better than returning an error?
>>
> 
> This is the current tclib behavior and doing otherwise would break the
> DT ABI.
> The reason for this behavior is that some TCB may have a clock
> per channel while others have one clock for the whole block.

What are the DT ABI? Can you point the snippets ?


>>> +}
>>> +
>>> +/*
>>> + * Clocksource and clockevent using the same channel(s)
>>> + */
>>> +static u64 tc_get_cycles(struct clocksource *cs)
>>> +{
>>> +	u32 lower, upper;
>>> +
>>> +	do {
>>> +		upper = readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[1]));
>>> +		lower = readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[0]));
>>> +	} while (upper != readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[1])));
>>> +
>>> +	return (upper << 16) | lower;
>>> +}
>>> +
>>> +static u64 tc_get_cycles32(struct clocksource *cs)
>>> +{
>>> +	return readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[0]));
>>> +}
>>> +
>>> +static u64 notrace tc_sched_clock_read(void)
>>> +{
>>> +	return tc_get_cycles(&tc.clksrc);
>>> +}
>>> +
>>> +static u64 notrace tc_sched_clock_read32(void)
>>> +{
>>> +	return tc_get_cycles32(&tc.clksrc);
>>> +}
>>> +
>>> +static int tcb_clkevt_next_event(unsigned long delta,
>>> +				 struct clock_event_device *d)
>>> +{
>>> +	u32 old, next, cur;
>>> +
>>> +	old = readl(tc.base + ATMEL_TC_CV(tc.channels[0]));
>>> +	next = old + delta;
>>> +	writel(next, tc.base + ATMEL_TC_RC(tc.channels[0]));
>>> +	cur = readl(tc.base + ATMEL_TC_CV(tc.channels[0]));
>>> +
>>> +	/* check whether the delta elapsed while setting the register */
>>> +	if ((next < old && cur < old && cur > next) ||
>>> +	    (next > old && (cur < old || cur > next))) {
>>> +		/*
>>> +		 * Clear the CPCS bit in the status register to avoid
>>> +		 * generating a spurious interrupt next time a valid
>>> +		 * timer event is configured.
>>> +		 */
>>> +		old = readl(tc.base + ATMEL_TC_SR(tc.channels[0]));
>>> +		return -ETIME;
>>> +	}
>>>> +	writel(ATMEL_TC_CPCS, tc.base + ATMEL_TC_IER(tc.channels[0]));
>>
>>
>> How this is compatible with 16bits as defined in the init function ?
>>
> 
> This is working fine because it is the lower bits channel and in that
> case, clockevents_config_and_register is call with the proper mask (16
> lower bits sets).
> 
>>> +	return 0;
>>> +}
>>> +
>>> +static irqreturn_t tc_clkevt_irq(int irq, void *handle)
>>> +{
>>> +	unsigned int sr;
>>> +
>>> +	sr = readl(tc.base + ATMEL_TC_SR(tc.channels[0]));
>>> +	if (sr & ATMEL_TC_CPCS) {
>>> +		tc.clkevt.event_handler(&tc.clkevt);
>>> +		return IRQ_HANDLED;
>>> +	}
>>> +
>>> +	return IRQ_NONE;
>>> +}
>>> +
>>> +static int tcb_clkevt_oneshot(struct clock_event_device *dev)
>>> +{
>>> +	if (clockevent_state_oneshot(dev))
>>> +		return 0;
>>> +
>>> +	/*
>>> +	 * Because both clockevent devices may share the same IRQ, we don't want
>>> +	 * the less likely one to stay requested
>>> +	 */
>>> +	return request_irq(tc.irq, tc_clkevt_irq, IRQF_TIMER | IRQF_SHARED,
>>> +			   tc.name, &tc);
>>> +}
>>> +
>>> +static int tcb_clkevt_shutdown(struct clock_event_device *dev)
>>> +{
>>> +	writel(0xff, tc.base + ATMEL_TC_IDR(tc.channels[0]));
>>> +	if (tc.bits == 16)
>>> +		writel(0xff, tc.base + ATMEL_TC_IDR(tc.channels[1]));
>>> +
>>> +	if (!clockevent_state_detached(dev))
>>> +		free_irq(tc.irq, &tc);
>>
>> Why are you requesting and freeing the irq instead of using the
>> disable/enable register operations ?
> 
> To avoid going through two interrupt handlers when we know that one is
> never used (that is when we have a separate channel for the clockevent,
> see following patch).

This explanation is not convincing. I will let you look at the
request_irq / free_irq internals (including __setup_irq) to figure out
why this is not possible.


>>> +	/* How fast will we be counting?  Pick something over 5 MHz.  */
>>> +	rate = (u32)clk_get_rate(tc.clk[0]);
>>> +	for (i = 0; i < 5; i++) {
>>> +		unsigned int divisor = atmel_tc_divisors[i];
>>> +		unsigned int tmp;
>>> +
>>> +		if (!divisor)
>>> +			continue;
>>
>> I suppose you meant here 'break' ? Use atmel_tc_divisors[] = { 2, 8, 32,
>> 128 }; And then the ARRAY_SIZE macro.
>>
>>> +		tmp = rate / divisor;
>>> +		pr_debug("TC: %u / %-3u [%d] --> %u\n", rate, divisor, i, tmp);
>>> +		if (best_divisor_idx > 0) {
>>> +			if (tmp < 5 * 1000 * 1000)
>>> +				continue;
>>> +		}
>>> +		divided_rate = tmp;
>>> +		best_divisor_idx = i;
>>
>> What is a best divisor ? The highest one or the one closer to 5MHz ?
>>
> 
> The whole divisor selection is coming for the previous driver and I'd
> rather not change it at this point, this is the topic of an other
> series.
> It chooses the first divisor that gives a counting rate over 5MHz

So why not stop the loop as soon as the rate / divisor is >= to 5MHz ?

>>> +	}
>>> +
>>> +	if (tc.bits == 32) {
>>> +		tc.clksrc.read = tc_get_cycles32;
>>> +		tcb_setup_single_chan(&tc, best_divisor_idx);
>>> +		tc_sched_clock = tc_sched_clock_read32;
>>> +		snprintf(tc.name, sizeof(tc.name), "%s:%d",
>>> +			 kbasename(node->parent->full_name), tc.channels[0]);
>>> +	} else {
>>> +		tc.clk[1] = tcb_clk_get(node, tc.channels[1]);
>>> +		if (IS_ERR(tc.clk[1]))
>>> +			goto err_disable_t0;
>>
>> This is very confusing. If the function tcb_clk_get() fails with this
>> channel, it will return "t0_clk" and will be used here ? Why ?
>>
> 
> See earlier explanation.
> 
>>> +		err = clk_prepare_enable(tc.clk[1]);
>>> +		if (err) {
>>> +			pr_debug("can't enable T1 clk\n");
>>> +			goto err_clk1;
>>> +		}
>>> +		tc.clksrc.read = tc_get_cycles,
>>> +		tcb_setup_dual_chan(&tc, best_divisor_idx);
>>> +		tc_sched_clock = tc_sched_clock_read;
>>> +		snprintf(tc.name, sizeof(tc.name), "%s:%d,%d",
>>> +			 kbasename(node->parent->full_name), tc.channels[0],
>>> +			 tc.channels[1]);
>>> +	}
>>> +
>>> +	pr_debug("%s at %d.%03d MHz\n", tc.name,
>>> +		 divided_rate / 1000000,
>>> +		 ((divided_rate + 500000) % 1000000) / 1000);
>>
>> Using two channels to emulate a 32bits timer has a significant cost,
>> especially in the sched_clock function which is part of the hot kernel
>> path. In addition it makes the code less maintainable and readable.
>>
>> Why don't you just stick to a specific rate with the prescalar value and
>> reduce the rating of the timer ? (example in the stm32 timer,
>> stm32_timer_set_prescaler and init function).
>>
>> It will be less precise (thus the lower rating) but will make the system
>> faster by preventing multiple register reads in the sched_clock.
>>
>> Is it an acceptable trade-off ?
>>
> 
> Not at this point, the goal is to not change the current behaviour.
> Some customer rely on the fast timer (they are bitbanging some RF
> protocols) and counting at more that 5MHz using a 16 bit timer is
> definitively too fast.

Not if you use the prescalar.

> This is something that could be changed once we implement timer rate
> selection (but I doubt it will make the code more readable).
> 
> I'm not saying we shouldn't question what was done 10 years ago but I'd
> rather not change it is this series.
> 
> Also, the goal is to get rid of the tcb_clksrc driver now that avr32 is
> gone. This will be done once the pwm driver is converted (I did that in
> v1).

You want to get rid of the tcb_clksrc by adding a new driver which is
very similar without taking into consideration to do a move to something
cleaner and putting in question what was already done.



>>> +	tcb_base = of_iomap(node->parent, 0);
>>> +	if (!tcb_base) {
>>> +		pr_err("%s +%d %s\n", __FILE__, __LINE__, __func__);
>>
>> Remove those debug information and replace them by a proper error message.
>>
> 
> My mistake, this will be simply removed.
> 
>>> +		return -ENXIO;
>>> +	}
>>> +
>>> +	match = of_match_node(atmel_tcb_dt_ids, node->parent);
>>> +	bits = (uintptr_t)match->data;
>>> +
>>> +	err = of_property_read_u32_index(node, "reg", 0, &channel);
>>> +	if (err)
>>> +		return err;
>>> +
>>> +	irq = of_irq_get(node->parent, channel);
>>> +	if (irq < 0) {
>>
>> if (irq <= 0) {
>>
>>> +		irq = of_irq_get(node->parent, 0);
>>
>> Why ?
>>
> 
> See the binding, 

Ok, can you point me to the code ?

> the timer is a child of the TCB and the TCB node has
> the irq info. So, the TCB is defined in the dtsi and the child nodes are
> in the board dts.
> 
>>> +		if (irq < 0)
>>
>> if (irq <= 0) {
>>
>>> +			return irq;
>>> +	}
>>> +
>>> +	if (bits == 16) {
>>> +		of_property_read_u32_index(node, "reg", 1, &chan1);
>>> +		if (chan1 == -1) {
>>> +			pr_err("%s: clocksource needs two channels\n",
>>> +			       node->parent->full_name);
>>
>> Think about it. The code is giving up at this point in the boot process.
>> So of two things, you consider there is an alternate clocksource /
>> clockevent or the system hangs:
>>
>>  - If there is an alternate clocksource why support 32bits by chaining
>> the channels with the cost it introduces instead of using the alternate
>> one ?
>>
> 
> The PIT is almost always the worse clocksource as it is very slow.

What is slow here ?

>>  - If there is no alternate clocksource why not support a 16bits less
>> precise timer and give up with the 32bits emulation and the complexity
>> it introduces in this driver ?
>>
> 
> If there is not alternate clocksource, the TCB is 32bit.
Alexandre Belloni Oct. 3, 2018, 10:26 p.m. UTC | #4
On 01/10/2018 23:24:11+0200, Daniel Lezcano wrote:
> On 25/09/2018 23:16, Alexandre Belloni wrote:
> > On 24/09/2018 03:59:55+0200, Daniel Lezcano wrote:
> >> On 13/09/2018 13:30, Alexandre Belloni wrote:
> >>> Add a driver for the Atmel Timer Counter Blocks. This driver provides a
> >>> clocksource and two clockevent devices.
> >>>
> >>> One of the clockevent device is linked to the clocksource counter and so it
> >>> will run at the same frequency. This will be used when there is only on TCB
> >>> channel available for timers.
> >>>
> >>> The other clockevent device runs on a separate TCB channel when available.
> >>>
> >>> This driver uses regmap and syscon to be able to probe early in the boot
> >>> and avoid having to switch on the TCB clocksource later. 
> >>
> >> Sorry, I don't get it. Can you elaborate?
> >>
> > 
> > The current existing way of sharing TCB channels is getting probed to
> > late in the boot process to be used as the clocksource so currently, the
> > PIT is necessary to act as the clocksource until the TCB clocksource can
> > be probed.
> > 
> > This is a big issue for SoCs without a PIT, they simply can't boot.
> 
> I'm still missing the point. The timer (clocksource + clocksource) is
> probed very early with TIMER_OF_DECLARE.
> 

No, tcb_clksrc is probed very latebecause it needs tclib to be probed
first and this happens at arch_initcall.
> 
> > This also solves:
> > 33d8c15559df Revert "clocksource/drivers/tcb_clksrc: Use 32 bit tcb as sched_clock"
> > 7b9f1d16e6d1 clocksource/drivers/tcb_clksrc: Use 32 bit tcb as sched_clock
> > 
> > 
> >>> Using regmap also
> >>> means that unused TCB channels may be used by other drivers (PWM for
> >>> example). read/writel are still used to access channel specific registers
> >>> to avoid the performance impact of regmap (mainly locking).
> >>
> >> I don't get the regmap reasoning here.
> > 
> > Because there are 3 channels per TCB, some TCB can have channels handled
> > by different drivers (say channel 0 for clocksource, channel 1 for
> > clockevent and channel 2 for PWM). There are configuration registers that
> > are shared for all the channels and so the access needs to be handled
> > properly. But as we discussed on a previous version of the patch, we
> > don't want to lock/unlock each time we read the clocksource so for the
> > channel specific registers, readl/writel is used directly.
> 
> Can you point me to the code where we have racy access to the
> ATMEL_TC_BMR register ?
> 

There is non currently.

> 
> >> My main concern with this driver is the 16bits chained support. See
> >> below in the comments.
> >>
> >>
> >>> +struct atmel_tcb_clksrc {
> >>> +	struct clocksource clksrc;
> >>> +	struct clock_event_device clkevt;
> >>> +	struct regmap *regmap;
> >>> +	void __iomem *base;
> >>> +	struct clk *clk[2];
> >>> +	char name[20];
> >>
> >> You can reasonably remove this field and use directly the ones in the
> >> clocksrc/evt.
> >>
> > 
> > name in struct clocksource is a pointer to a string, we still need a
> > place to store that string.
> 
> Come on!
> 
> char *name = kasprintf(...);
> 
> tc.clkevt.name = name;
> tc.clksrc.name = name;
> 
> no need to add a specific field for this.
> 

And how is unconditionally dynamically allocating memory better than
having a field in the struct?

> >> Can you explain why returning "t0_clk" is better than returning an error?
> >>
> > 
> > This is the current tclib behavior and doing otherwise would break the
> > DT ABI.
> > The reason for this behavior is that some TCB may have a clock
> > per channel while others have one clock for the whole block.
> 
> What are the DT ABI? Can you point the snippets ?
> 

It is documented in Documentation/devicetree/bindings/mfd/atmel-tcb.txt

> >> Using two channels to emulate a 32bits timer has a significant cost,
> >> especially in the sched_clock function which is part of the hot kernel
> >> path. In addition it makes the code less maintainable and readable.
> >>
> >> Why don't you just stick to a specific rate with the prescalar value and
> >> reduce the rating of the timer ? (example in the stm32 timer,
> >> stm32_timer_set_prescaler and init function).
> >>
> >> It will be less precise (thus the lower rating) but will make the system
> >> faster by preventing multiple register reads in the sched_clock.
> >>
> >> Is it an acceptable trade-off ?
> >>
> > 
> > Not at this point, the goal is to not change the current behaviour.
> > Some customer rely on the fast timer (they are bitbanging some RF
> > protocols) and counting at more that 5MHz using a 16 bit timer is
> > definitively too fast.
> 
> Not if you use the prescalar.
> 

I'm not sure what you mean. 16bits at 5MHz (and it is actually faster than that)
wraps up every 13ms.

> > This is something that could be changed once we implement timer rate
> > selection (but I doubt it will make the code more readable).
> > 
> > I'm not saying we shouldn't question what was done 10 years ago but I'd
> > rather not change it is this series.
> > 
> > Also, the goal is to get rid of the tcb_clksrc driver now that avr32 is
> > gone. This will be done once the pwm driver is converted (I did that in
> > v1).
> 
> You want to get rid of the tcb_clksrc by adding a new driver which is
> very similar without taking into consideration to do a move to something
> cleaner and putting in question what was already done.
> 
> 
> 
> >>> +	tcb_base = of_iomap(node->parent, 0);
> >>> +	if (!tcb_base) {
> >>> +		pr_err("%s +%d %s\n", __FILE__, __LINE__, __func__);
> >>
> >> Remove those debug information and replace them by a proper error message.
> >>
> > 
> > My mistake, this will be simply removed.
> > 
> >>> +		return -ENXIO;
> >>> +	}
> >>> +
> >>> +	match = of_match_node(atmel_tcb_dt_ids, node->parent);
> >>> +	bits = (uintptr_t)match->data;
> >>> +
> >>> +	err = of_property_read_u32_index(node, "reg", 0, &channel);
> >>> +	if (err)
> >>> +		return err;
> >>> +
> >>> +	irq = of_irq_get(node->parent, channel);
> >>> +	if (irq < 0) {
> >>
> >> if (irq <= 0) {
> >>
> >>> +		irq = of_irq_get(node->parent, 0);
> >>
> >> Why ?
> >>
> > 
> > See the binding, 
> 
> Ok, can you point me to the code ?
> 
> > the timer is a child of the TCB and the TCB node has
> > the irq info. So, the TCB is defined in the dtsi and the child nodes are
> > in the board dts.
> > 
> >>> +		if (irq < 0)
> >>
> >> if (irq <= 0) {
> >>
> >>> +			return irq;
> >>> +	}
> >>> +
> >>> +	if (bits == 16) {
> >>> +		of_property_read_u32_index(node, "reg", 1, &chan1);
> >>> +		if (chan1 == -1) {
> >>> +			pr_err("%s: clocksource needs two channels\n",
> >>> +			       node->parent->full_name);
> >>
> >> Think about it. The code is giving up at this point in the boot process.
> >> So of two things, you consider there is an alternate clocksource /
> >> clockevent or the system hangs:
> >>
> >>  - If there is an alternate clocksource why support 32bits by chaining
> >> the channels with the cost it introduces instead of using the alternate
> >> one ?
> >>
> > 
> > The PIT is almost always the worse clocksource as it is very slow.
> 
> What is slow here ?
> 

Well, my wording was very poor. The main issue with the pit is the
resolution of the clockevent (95 to 160 ms). Also, it wrap every 10
seconds or so.
diff mbox series

Patch

diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index a11f4ba98b05..8c7324e409f6 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -403,6 +403,14 @@  config ATMEL_ST
 	help
 	  Support for the Atmel ST timer.
 
+config ATMEL_ARM_TCB_CLKSRC
+	bool "Microchip ARM TC Block" if COMPILE_TEST
+	select REGMAP_MMIO
+	depends on GENERIC_CLOCKEVENTS
+	help
+	  This enables build of clocksource and clockevent driver for
+	  the integrated Timer Counter Blocks in Microchip ARM SoCs.
+
 config CLKSRC_EXYNOS_MCT
 	bool "Exynos multi core timer driver" if COMPILE_TEST
 	depends on ARM || ARM64
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index db51b2427e8a..0df9384a1230 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -3,7 +3,8 @@  obj-$(CONFIG_TIMER_OF)		+= timer-of.o
 obj-$(CONFIG_TIMER_PROBE)	+= timer-probe.o
 obj-$(CONFIG_ATMEL_PIT)		+= timer-atmel-pit.o
 obj-$(CONFIG_ATMEL_ST)		+= timer-atmel-st.o
-obj-$(CONFIG_ATMEL_TCB_CLKSRC)	+= tcb_clksrc.o
+obj-$(CONFIG_ATMEL_TCB_CLKSRC) += tcb_clksrc.o
+obj-$(CONFIG_ATMEL_ARM_TCB_CLKSRC)	+= timer-atmel-tcb.o
 obj-$(CONFIG_X86_PM_TIMER)	+= acpi_pm.o
 obj-$(CONFIG_SCx200HR_TIMER)	+= scx200_hrt.o
 obj-$(CONFIG_CS5535_CLOCK_EVENT_SRC)	+= cs5535-clockevt.o
diff --git a/drivers/clocksource/timer-atmel-tcb.c b/drivers/clocksource/timer-atmel-tcb.c
new file mode 100644
index 000000000000..21fbe430f91b
--- /dev/null
+++ b/drivers/clocksource/timer-atmel-tcb.c
@@ -0,0 +1,410 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/clk.h>
+#include <linux/clockchips.h>
+#include <linux/clocksource.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/mfd/syscon.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/regmap.h>
+#include <linux/sched_clock.h>
+#include <soc/at91/atmel_tcb.h>
+
+struct atmel_tcb_clksrc {
+	struct clocksource clksrc;
+	struct clock_event_device clkevt;
+	struct regmap *regmap;
+	void __iomem *base;
+	struct clk *clk[2];
+	char name[20];
+	int channels[2];
+	int bits;
+	int irq;
+	struct {
+		u32 cmr;
+		u32 imr;
+		u32 rc;
+		bool clken;
+	} cache[2];
+	u32 bmr_cache;
+	bool registered;
+	bool clk_enabled;
+};
+
+static struct atmel_tcb_clksrc tc;
+
+static struct clk *tcb_clk_get(struct device_node *node, int channel)
+{
+	struct clk *clk;
+	char clk_name[] = "t0_clk";
+
+	clk_name[1] += channel;
+	clk = of_clk_get_by_name(node->parent, clk_name);
+	if (!IS_ERR(clk))
+		return clk;
+
+	return of_clk_get_by_name(node->parent, "t0_clk");
+}
+
+/*
+ * Clocksource and clockevent using the same channel(s)
+ */
+static u64 tc_get_cycles(struct clocksource *cs)
+{
+	u32 lower, upper;
+
+	do {
+		upper = readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[1]));
+		lower = readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[0]));
+	} while (upper != readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[1])));
+
+	return (upper << 16) | lower;
+}
+
+static u64 tc_get_cycles32(struct clocksource *cs)
+{
+	return readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[0]));
+}
+
+static u64 notrace tc_sched_clock_read(void)
+{
+	return tc_get_cycles(&tc.clksrc);
+}
+
+static u64 notrace tc_sched_clock_read32(void)
+{
+	return tc_get_cycles32(&tc.clksrc);
+}
+
+static int tcb_clkevt_next_event(unsigned long delta,
+				 struct clock_event_device *d)
+{
+	u32 old, next, cur;
+
+	old = readl(tc.base + ATMEL_TC_CV(tc.channels[0]));
+	next = old + delta;
+	writel(next, tc.base + ATMEL_TC_RC(tc.channels[0]));
+	cur = readl(tc.base + ATMEL_TC_CV(tc.channels[0]));
+
+	/* check whether the delta elapsed while setting the register */
+	if ((next < old && cur < old && cur > next) ||
+	    (next > old && (cur < old || cur > next))) {
+		/*
+		 * Clear the CPCS bit in the status register to avoid
+		 * generating a spurious interrupt next time a valid
+		 * timer event is configured.
+		 */
+		old = readl(tc.base + ATMEL_TC_SR(tc.channels[0]));
+		return -ETIME;
+	}
+
+	writel(ATMEL_TC_CPCS, tc.base + ATMEL_TC_IER(tc.channels[0]));
+
+	return 0;
+}
+
+static irqreturn_t tc_clkevt_irq(int irq, void *handle)
+{
+	unsigned int sr;
+
+	sr = readl(tc.base + ATMEL_TC_SR(tc.channels[0]));
+	if (sr & ATMEL_TC_CPCS) {
+		tc.clkevt.event_handler(&tc.clkevt);
+		return IRQ_HANDLED;
+	}
+
+	return IRQ_NONE;
+}
+
+static int tcb_clkevt_oneshot(struct clock_event_device *dev)
+{
+	if (clockevent_state_oneshot(dev))
+		return 0;
+
+	/*
+	 * Because both clockevent devices may share the same IRQ, we don't want
+	 * the less likely one to stay requested
+	 */
+	return request_irq(tc.irq, tc_clkevt_irq, IRQF_TIMER | IRQF_SHARED,
+			   tc.name, &tc);
+}
+
+static int tcb_clkevt_shutdown(struct clock_event_device *dev)
+{
+	writel(0xff, tc.base + ATMEL_TC_IDR(tc.channels[0]));
+	if (tc.bits == 16)
+		writel(0xff, tc.base + ATMEL_TC_IDR(tc.channels[1]));
+
+	if (!clockevent_state_detached(dev))
+		free_irq(tc.irq, &tc);
+
+	return 0;
+}
+
+static void __init tcb_setup_dual_chan(struct atmel_tcb_clksrc *tc,
+				       int mck_divisor_idx)
+{
+	/* first channel: waveform mode, input mclk/8, clock TIOA on overflow */
+	writel(mck_divisor_idx			/* likely divide-by-8 */
+	       | ATMEL_TC_CMR_WAVE
+	       | ATMEL_TC_CMR_WAVESEL_UP	/* free-run */
+	       | ATMEL_TC_CMR_ACPA(SET)		/* TIOA rises at 0 */
+	       | ATMEL_TC_CMR_ACPC(CLEAR),	/* (duty cycle 50%) */
+	       tc->base + ATMEL_TC_CMR(tc->channels[0]));
+	writel(0x0000, tc->base + ATMEL_TC_RA(tc->channels[0]));
+	writel(0x8000, tc->base + ATMEL_TC_RC(tc->channels[0]));
+	writel(0xff, tc->base + ATMEL_TC_IDR(tc->channels[0]));	/* no irqs */
+	writel(ATMEL_TC_CCR_CLKEN, tc->base + ATMEL_TC_CCR(tc->channels[0]));
+
+	/* second channel: waveform mode, input TIOA */
+	writel(ATMEL_TC_CMR_XC(tc->channels[1])		/* input: TIOA */
+	       | ATMEL_TC_CMR_WAVE
+	       | ATMEL_TC_CMR_WAVESEL_UP,		/* free-run */
+	       tc->base + ATMEL_TC_CMR(tc->channels[1]));
+	writel(0xff, tc->base + ATMEL_TC_IDR(tc->channels[1]));	/* no irqs */
+	writel(ATMEL_TC_CCR_CLKEN, tc->base + ATMEL_TC_CCR(tc->channels[1]));
+
+	/* chain both channel, we assume the previous channel */
+	regmap_write(tc->regmap, ATMEL_TC_BMR,
+		     ATMEL_TC_BMR_TCXC(1 + tc->channels[1], tc->channels[1]));
+	/* then reset all the timers */
+	regmap_write(tc->regmap, ATMEL_TC_BCR, ATMEL_TC_BCR_SYNC);
+}
+
+static void __init tcb_setup_single_chan(struct atmel_tcb_clksrc *tc,
+					 int mck_divisor_idx)
+{
+	/* channel 0:  waveform mode, input mclk/8 */
+	writel(mck_divisor_idx			/* likely divide-by-8 */
+	       | ATMEL_TC_CMR_WAVE
+	       | ATMEL_TC_CMR_WAVESEL_UP,	/* free-run */
+	       tc->base + ATMEL_TC_CMR(tc->channels[0]));
+	writel(0xff, tc->base + ATMEL_TC_IDR(tc->channels[0]));	/* no irqs */
+	writel(ATMEL_TC_CCR_CLKEN, tc->base + ATMEL_TC_CCR(tc->channels[0]));
+
+	/* then reset all the timers */
+	regmap_write(tc->regmap, ATMEL_TC_BCR, ATMEL_TC_BCR_SYNC);
+}
+
+static void tc_clksrc_suspend(struct clocksource *cs)
+{
+	int i;
+
+	for (i = 0; i < 1 + (tc.bits == 16); i++) {
+		tc.cache[i].cmr = readl(tc.base + ATMEL_TC_CMR(tc.channels[i]));
+		tc.cache[i].imr = readl(tc.base + ATMEL_TC_IMR(tc.channels[i]));
+		tc.cache[i].rc = readl(tc.base + ATMEL_TC_RC(tc.channels[i]));
+		tc.cache[i].clken = !!(readl(tc.base +
+					     ATMEL_TC_SR(tc.channels[i])) &
+				       ATMEL_TC_CLKSTA);
+	}
+
+	if (tc.bits == 16)
+		regmap_read(tc.regmap, ATMEL_TC_BMR, &tc.bmr_cache);
+}
+
+static void tc_clksrc_resume(struct clocksource *cs)
+{
+	int i;
+
+	for (i = 0; i < 1 + (tc.bits == 16); i++) {
+		/* Restore registers for the channel, RA and RB are not used  */
+		writel(tc.cache[i].cmr, tc.base + ATMEL_TC_CMR(tc.channels[i]));
+		writel(tc.cache[i].rc, tc.base + ATMEL_TC_RC(tc.channels[i]));
+		writel(0, tc.base + ATMEL_TC_RA(tc.channels[i]));
+		writel(0, tc.base + ATMEL_TC_RB(tc.channels[i]));
+		/* Disable all the interrupts */
+		writel(0xff, tc.base + ATMEL_TC_IDR(tc.channels[i]));
+		/* Reenable interrupts that were enabled before suspending */
+		writel(tc.cache[i].imr, tc.base + ATMEL_TC_IER(tc.channels[i]));
+
+		/* Start the clock if it was used */
+		if (tc.cache[i].clken)
+			writel(ATMEL_TC_CCR_CLKEN, tc.base +
+			       ATMEL_TC_CCR(tc.channels[i]));
+	}
+
+	/* in case of dual channel, chain channels */
+	if (tc.bits == 16)
+		regmap_write(tc.regmap, ATMEL_TC_BMR, tc.bmr_cache);
+	/* Finally, trigger all the channels*/
+	regmap_write(tc.regmap, ATMEL_TC_BCR, ATMEL_TC_BCR_SYNC);
+}
+
+static int __init tcb_clksrc_register(struct device_node *node,
+				      struct regmap *regmap, void __iomem *base,
+				      int channel, int channel1, int irq,
+				      int bits)
+{
+	u32 rate, divided_rate = 0;
+	int best_divisor_idx = -1;
+	int i, err = -1;
+	u64 (*tc_sched_clock)(void);
+
+	tc.regmap = regmap;
+	tc.base = base;
+	tc.channels[0] = channel;
+	tc.channels[1] = channel1;
+	tc.irq = irq;
+	tc.bits = bits;
+
+	tc.clk[0] = tcb_clk_get(node, tc.channels[0]);
+	if (IS_ERR(tc.clk[0]))
+		return PTR_ERR(tc.clk[0]);
+	err = clk_prepare_enable(tc.clk[0]);
+	if (err) {
+		pr_debug("can't enable T0 clk\n");
+		goto err_clk;
+	}
+
+	/* How fast will we be counting?  Pick something over 5 MHz.  */
+	rate = (u32)clk_get_rate(tc.clk[0]);
+	for (i = 0; i < 5; i++) {
+		unsigned int divisor = atmel_tc_divisors[i];
+		unsigned int tmp;
+
+		if (!divisor)
+			continue;
+
+		tmp = rate / divisor;
+		pr_debug("TC: %u / %-3u [%d] --> %u\n", rate, divisor, i, tmp);
+		if (best_divisor_idx > 0) {
+			if (tmp < 5 * 1000 * 1000)
+				continue;
+		}
+		divided_rate = tmp;
+		best_divisor_idx = i;
+	}
+
+	if (tc.bits == 32) {
+		tc.clksrc.read = tc_get_cycles32;
+		tcb_setup_single_chan(&tc, best_divisor_idx);
+		tc_sched_clock = tc_sched_clock_read32;
+		snprintf(tc.name, sizeof(tc.name), "%s:%d",
+			 kbasename(node->parent->full_name), tc.channels[0]);
+	} else {
+		tc.clk[1] = tcb_clk_get(node, tc.channels[1]);
+		if (IS_ERR(tc.clk[1]))
+			goto err_disable_t0;
+
+		err = clk_prepare_enable(tc.clk[1]);
+		if (err) {
+			pr_debug("can't enable T1 clk\n");
+			goto err_clk1;
+		}
+		tc.clksrc.read = tc_get_cycles,
+		tcb_setup_dual_chan(&tc, best_divisor_idx);
+		tc_sched_clock = tc_sched_clock_read;
+		snprintf(tc.name, sizeof(tc.name), "%s:%d,%d",
+			 kbasename(node->parent->full_name), tc.channels[0],
+			 tc.channels[1]);
+	}
+
+	pr_debug("%s at %d.%03d MHz\n", tc.name,
+		 divided_rate / 1000000,
+		 ((divided_rate + 500000) % 1000000) / 1000);
+
+	tc.clksrc.name = tc.name;
+	tc.clksrc.suspend = tc_clksrc_suspend;
+	tc.clksrc.resume = tc_clksrc_resume;
+	tc.clksrc.rating = 200;
+	tc.clksrc.mask = CLOCKSOURCE_MASK(32);
+	tc.clksrc.flags = CLOCK_SOURCE_IS_CONTINUOUS;
+
+	err = clocksource_register_hz(&tc.clksrc, divided_rate);
+	if (err)
+		goto err_disable_t1;
+
+	sched_clock_register(tc_sched_clock, 32, divided_rate);
+
+	tc.registered = true;
+
+	/* Set up and register clockevents */
+	tc.clkevt.name = tc.name;
+	tc.clkevt.cpumask = cpumask_of(0);
+	tc.clkevt.set_next_event = tcb_clkevt_next_event;
+	tc.clkevt.set_state_oneshot = tcb_clkevt_oneshot;
+	tc.clkevt.set_state_shutdown = tcb_clkevt_shutdown;
+	tc.clkevt.features = CLOCK_EVT_FEAT_ONESHOT;
+	tc.clkevt.rating = 125;
+
+	clockevents_config_and_register(&tc.clkevt, divided_rate, 1,
+					BIT(tc.bits) - 1);
+
+	return 0;
+
+err_disable_t1:
+	if (tc.bits == 16)
+		clk_disable_unprepare(tc.clk[1]);
+
+err_clk1:
+	if (tc.bits == 16)
+		clk_put(tc.clk[1]);
+
+err_disable_t0:
+	clk_disable_unprepare(tc.clk[0]);
+
+err_clk:
+	clk_put(tc.clk[0]);
+
+	pr_err("%s: unable to register clocksource/clockevent\n",
+	       tc.clksrc.name);
+
+	return err;
+}
+
+static int __init tcb_clksrc_init(struct device_node *node)
+{
+	const struct of_device_id *match;
+	struct regmap *regmap;
+	void __iomem *tcb_base;
+	u32 channel;
+	int irq, err, chan1 = -1;
+	unsigned bits;
+
+	if (tc.registered)
+		return -ENODEV;
+
+	/*
+	 * The regmap has to be used to access registers that are shared
+	 * between channels on the same TCB but we keep direct IO access for
+	 * the counters to avoid the impact on performance
+	 */
+	regmap = syscon_node_to_regmap(node->parent);
+	if (IS_ERR(regmap))
+		return PTR_ERR(regmap);
+
+	tcb_base = of_iomap(node->parent, 0);
+	if (!tcb_base) {
+		pr_err("%s +%d %s\n", __FILE__, __LINE__, __func__);
+		return -ENXIO;
+	}
+
+	match = of_match_node(atmel_tcb_dt_ids, node->parent);
+	bits = (uintptr_t)match->data;
+
+	err = of_property_read_u32_index(node, "reg", 0, &channel);
+	if (err)
+		return err;
+
+	irq = of_irq_get(node->parent, channel);
+	if (irq < 0) {
+		irq = of_irq_get(node->parent, 0);
+		if (irq < 0)
+			return irq;
+	}
+
+	if (bits == 16) {
+		of_property_read_u32_index(node, "reg", 1, &chan1);
+		if (chan1 == -1) {
+			pr_err("%s: clocksource needs two channels\n",
+			       node->parent->full_name);
+			return -EINVAL;
+		}
+	}
+
+	return tcb_clksrc_register(node, regmap, tcb_base, channel, chan1, irq,
+				   bits);
+}
+TIMER_OF_DECLARE(atmel_tcb_clksrc, "atmel,tcb-timer", tcb_clksrc_init);