diff mbox series

[RFT,v3,16/27] irqchip/apple-aic: Add support for the Apple Interrupt Controller

Message ID 20210304213902.83903-17-marcan@marcan.st (mailing list archive)
State New, archived
Headers show
Series Apple M1 SoC platform bring-up | expand

Commit Message

Hector Martin March 4, 2021, 9:38 p.m. UTC
This is the root interrupt controller used on Apple ARM SoCs such as the
M1. This irqchip driver performs multiple functions:

* Handles both IRQs and FIQs

* Drives the AIC peripheral itself (which handles IRQs)

* Dispatches FIQs to downstream hard-wired clients (currently the ARM
  timer).

* Implements a virtual IPI multiplexer to funnel multiple Linux IPIs
  into a single hardware IPI

Signed-off-by: Hector Martin <marcan@marcan.st>
---
 MAINTAINERS                     |   2 +
 drivers/irqchip/Kconfig         |   8 +
 drivers/irqchip/Makefile        |   1 +
 drivers/irqchip/irq-apple-aic.c | 710 ++++++++++++++++++++++++++++++++
 include/linux/cpuhotplug.h      |   1 +
 5 files changed, 722 insertions(+)
 create mode 100644 drivers/irqchip/irq-apple-aic.c

Comments

Andy Shevchenko March 5, 2021, 3:05 p.m. UTC | #1
On Thu, Mar 4, 2021 at 11:41 PM Hector Martin <marcan@marcan.st> wrote:
>
> This is the root interrupt controller used on Apple ARM SoCs such as the
> M1. This irqchip driver performs multiple functions:
>
> * Handles both IRQs and FIQs
>
> * Drives the AIC peripheral itself (which handles IRQs)
>
> * Dispatches FIQs to downstream hard-wired clients (currently the ARM
>   timer).
>
> * Implements a virtual IPI multiplexer to funnel multiple Linux IPIs
>   into a single hardware IPI

...

> + *   - <0 nr flags> - hwirq #nr
> + *   - <1 nr flags> - FIQ #nr
> + *     - nr=0  Physical HV timer
> + *     - nr=1  Virtual HV timer
> + *     - nr=2  Physical guest timer
> + *     - nr=3  Virtual guest timer

> + *

Unneeded blank line.

> + */

...

> +#define pr_fmt(fmt) "%s: " fmt, __func__

This is not needed, really, if you have unique / distinguishable
messages in the first place.
Rather people include module names, which may be useful.

...

> +#define MASK_REG(x)            (4 * ((x) >> 5))
> +#define MASK_BIT(x)            BIT((x) & 0x1f)

GENMASK(4,0)

...

> +/*
> + * Max 31 bits in IPI SEND register (top bit is self).
> + * >=32-core chips will need code changes anyway.
> + */
> +#define AIC_MAX_CPUS           31

I would put it as (32 - 1) to show that the register is actually 32-bit long.

...

> +static atomic_t aic_vipi_flag[AIC_MAX_CPUS];
> +static atomic_t aic_vipi_enable[AIC_MAX_CPUS];

Isn't it easier to handle these when they are full width, i.e. 32
items per the array?

...

> +static int aic_irq_set_affinity(struct irq_data *d,
> +                               const struct cpumask *mask_val, bool force)
> +{
> +       irq_hw_number_t hwirq = irqd_to_hwirq(d);
> +       struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
> +       int cpu;
> +
> +       if (hwirq > ic->nr_hw)

>= ?

> +               return -EINVAL;
> +
> +       if (force)
> +               cpu = cpumask_first(mask_val);
> +       else
> +               cpu = cpumask_any_and(mask_val, cpu_online_mask);
> +
> +       aic_ic_write(ic, AIC_TARGET_CPU + hwirq * 4, BIT(cpu));
> +       irq_data_update_effective_affinity(d, cpumask_of(cpu));
> +
> +       return IRQ_SET_MASK_OK;
> +}

...

> +static void aic_fiq_mask(struct irq_data *d)
> +{
> +       /* Only the guest timers have real mask bits, unfortunately. */
> +       switch (d->hwirq) {
> +       case AIC_TMR_GUEST_PHYS:
> +               sysreg_clear_set_s(SYS_APL_VM_TMR_FIQ_ENA_EL1, VM_TMR_FIQ_ENABLE_P, 0);
> +               break;
> +       case AIC_TMR_GUEST_VIRT:
> +               sysreg_clear_set_s(SYS_APL_VM_TMR_FIQ_ENA_EL1, VM_TMR_FIQ_ENABLE_V, 0);
> +               break;

default case? // some compilers may not be happy
Ditto for all similar places in the series.

> +       }
> +}

...

> +#define TIMER_FIRING(x)                                                        \
> +       (((x) & (ARCH_TIMER_CTRL_ENABLE | ARCH_TIMER_CTRL_IT_MASK |            \
> +                ARCH_TIMER_CTRL_IT_STAT)) ==                                  \
> +        (ARCH_TIMER_CTRL_ENABLE | ARCH_TIMER_CTRL_IT_STAT))

It's a bit hard to read. Perhaps

#define FOO_MASK  (_ENABLE | _STAT)
#define _FIRING ... (FOO_MASK | _MASK == FOO_MASK)

?

...

> +       if ((read_sysreg_s(SYS_APL_PMCR0_EL1) & (PMCR0_IMODE | PMCR0_IACT))
> +                       == (FIELD_PREP(PMCR0_IMODE, PMCR0_IMODE_FIQ) | PMCR0_IACT)) {

It's better to have == on the previous line.

...

> +       for_each_set_bit(i, &firing, AIC_NR_SWIPI) {
> +               handle_domain_irq(aic_irqc->ipi_domain, i, regs);
> +       }

No {} needed.

...

> +static int aic_init_smp(struct aic_irq_chip *irqc, struct device_node *node)
> +{
> +       int base_ipi;

Introducing a temporary variable may help with readability

...  *d = irqc->hw_domain;

> +       irqc->ipi_domain = irq_domain_create_linear(irqc->hw_domain->fwnode, AIC_NR_SWIPI,
> +                                                   &aic_ipi_domain_ops, irqc);
> +       if (WARN_ON(!irqc->ipi_domain))
> +               return -ENODEV;
> +
> +       irqc->ipi_domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE;
> +       irq_domain_update_bus_token(irqc->ipi_domain, DOMAIN_BUS_IPI);
> +
> +       base_ipi = __irq_domain_alloc_irqs(irqc->ipi_domain, -1, AIC_NR_SWIPI,
> +                                          NUMA_NO_NODE, NULL, false, NULL);
> +
> +       if (WARN_ON(!base_ipi)) {
> +               irq_domain_remove(irqc->ipi_domain);
> +               return -ENODEV;
> +       }
> +
> +       set_smp_ipi_range(base_ipi, AIC_NR_SWIPI);
> +
> +       return 0;
> +}

...

> +       return 0;
> +

Extra blank line.

...

> +       irqc->hw_domain = irq_domain_create_linear(of_node_to_fwnode(node),
> +                                                  irqc->nr_hw + AIC_NR_FIQ,
> +                                                  &aic_irq_domain_ops, irqc);

If you are sure it will be always OF-only, why not to use
irq_domain_add_linear()?

...

> +       for (i = 0; i < BITS_TO_U32(irqc->nr_hw); i++)
> +               aic_ic_write(irqc, AIC_MASK_SET + i * 4, ~0);
> +       for (i = 0; i < BITS_TO_U32(irqc->nr_hw); i++)
> +               aic_ic_write(irqc, AIC_SW_CLR + i * 4, ~0);

~0 is a beast when it suddenly gets into > int size.

I would recommend using either GENMASK() if it's a bit field, or
type_MAX values if it's a plain number.
Marc Zyngier March 8, 2021, 11:50 a.m. UTC | #2
On Fri, 05 Mar 2021 15:05:08 +0000,
Andy Shevchenko <andy.shevchenko@gmail.com> wrote:

[...]

> > +#define TIMER_FIRING(x)                                                        \
> > +       (((x) & (ARCH_TIMER_CTRL_ENABLE | ARCH_TIMER_CTRL_IT_MASK |            \
> > +                ARCH_TIMER_CTRL_IT_STAT)) ==                                  \
> > +        (ARCH_TIMER_CTRL_ENABLE | ARCH_TIMER_CTRL_IT_STAT))
> 
> It's a bit hard to read. Perhaps
> 
> #define FOO_MASK  (_ENABLE | _STAT)
> #define _FIRING ... (FOO_MASK | _MASK == FOO_MASK)

The expression above is a direct translation of the architecture
reference manual, and I'd rather not have that hidden behind a bunch
of obscure macros.

[...]

> > +       irqc->hw_domain = irq_domain_create_linear(of_node_to_fwnode(node),
> > +                                                  irqc->nr_hw + AIC_NR_FIQ,
> > +                                                  &aic_irq_domain_ops, irqc);
> 
> If you are sure it will be always OF-only, why not to use
> irq_domain_add_linear()?

The OF-only API is deprecated, and there is no point in using it for
*new* code, specially when things like IPI allocation require the use
of the modern API. For arm64 root controllers, that's the way to go.

Thanks,

	M.
Andy Shevchenko March 8, 2021, 12:02 p.m. UTC | #3
On Mon, Mar 8, 2021 at 1:50 PM Marc Zyngier <maz@kernel.org> wrote:
> On Fri, 05 Mar 2021 15:05:08 +0000,
> Andy Shevchenko <andy.shevchenko@gmail.com> wrote:

...

> > > +#define TIMER_FIRING(x)                                                        \
> > > +       (((x) & (ARCH_TIMER_CTRL_ENABLE | ARCH_TIMER_CTRL_IT_MASK |            \
> > > +                ARCH_TIMER_CTRL_IT_STAT)) ==                                  \
> > > +        (ARCH_TIMER_CTRL_ENABLE | ARCH_TIMER_CTRL_IT_STAT))
> >
> > It's a bit hard to read. Perhaps
> >
> > #define FOO_MASK  (_ENABLE | _STAT)
> > #define _FIRING ... (FOO_MASK | _MASK == FOO_MASK)
>
> The expression above is a direct translation of the architecture
> reference manual, and I'd rather not have that hidden behind a bunch
> of obscure macros.

OK!

...

> > > +       irqc->hw_domain = irq_domain_create_linear(of_node_to_fwnode(node),
> > > +                                                  irqc->nr_hw + AIC_NR_FIQ,
> > > +                                                  &aic_irq_domain_ops, irqc);
> >
> > If you are sure it will be always OF-only, why not to use
> > irq_domain_add_linear()?
>
> The OF-only API is deprecated, and there is no point in using it for
> *new* code, specially when things like IPI allocation require the use
> of the modern API. For arm64 root controllers, that's the way to go.

Good to know, thanks!
Marc Zyngier March 8, 2021, 1:31 p.m. UTC | #4
On Thu, 04 Mar 2021 21:38:51 +0000,
Hector Martin <marcan@marcan.st> wrote:
> 
> This is the root interrupt controller used on Apple ARM SoCs such as the
> M1. This irqchip driver performs multiple functions:
> 
> * Handles both IRQs and FIQs
> 
> * Drives the AIC peripheral itself (which handles IRQs)
> 
> * Dispatches FIQs to downstream hard-wired clients (currently the ARM
>   timer).
> 
> * Implements a virtual IPI multiplexer to funnel multiple Linux IPIs
>   into a single hardware IPI
>

[...]

> Signed-off-by: Hector Martin <marcan@marcan.st>
> +static void __exception_irq_entry aic_handle_irq(struct pt_regs *regs)
> +{
> +	struct aic_irq_chip *ic = aic_irqc;
> +	u32 event, type, irq;
> +
> +	do {
> +		/*
> +		 * We cannot use a relaxed read here, as DMA needs to be
> +		 * ordered with respect to the IRQ firing.
> +		 */
> +		event = readl(ic->base + AIC_EVENT);
> +		type = FIELD_GET(AIC_EVENT_TYPE, event);
> +		irq = FIELD_GET(AIC_EVENT_NUM, event);
> +
> +		if (type == AIC_EVENT_TYPE_HW)
> +			handle_domain_irq(aic_irqc->hw_domain, irq, regs);
> +		else if (type == AIC_EVENT_TYPE_IPI && irq == 1)
> +			aic_handle_ipi(regs);
> +		else if (event != 0)
> +			pr_err("Unknown IRQ event %d, %d\n", type, irq);
> +	} while (event);
> +
> +	/*
> +	 * vGIC maintenance interrupts end up here too, so we need to check
> +	 * for them separately. Just report and disable vGIC for now, until
> +	 * we implement this properly.
> +	 */
> +	if ((read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EN) &&
> +		read_sysreg_s(SYS_ICH_MISR_EL2) != 0) {
> +		pr_err("vGIC IRQ fired, disabling.\n");

Please add a _ratelimited here. Whilst debugging KVM on this machine,
I ended up with this firing at such a rate that it was impossible to
do anything useful. Ratelimiting it allowed me to pinpoint the
problem.

[...]

> +/*
> + * FIQ irqchip
> + */
> +
> +static void aic_fiq_mask(struct irq_data *d)
> +{
> +	/* Only the guest timers have real mask bits, unfortunately. */
> +	switch (d->hwirq) {
> +	case AIC_TMR_GUEST_PHYS:
> +		sysreg_clear_set_s(SYS_APL_VM_TMR_FIQ_ENA_EL1, VM_TMR_FIQ_ENABLE_P, 0);
> +		break;
> +	case AIC_TMR_GUEST_VIRT:
> +		sysreg_clear_set_s(SYS_APL_VM_TMR_FIQ_ENA_EL1, VM_TMR_FIQ_ENABLE_V, 0);
> +		break;
> +	}
> +}
> +
> +static void aic_fiq_unmask(struct irq_data *d)
> +{
> +	switch (d->hwirq) {
> +	case AIC_TMR_GUEST_PHYS:
> +		sysreg_clear_set_s(SYS_APL_VM_TMR_FIQ_ENA_EL1, 0, VM_TMR_FIQ_ENABLE_P);
> +		break;
> +	case AIC_TMR_GUEST_VIRT:
> +		sysreg_clear_set_s(SYS_APL_VM_TMR_FIQ_ENA_EL1, 0, VM_TMR_FIQ_ENABLE_V);
> +		break;
> +	}
> +}
> +
> +static void aic_fiq_eoi(struct irq_data *d)
> +{
> +	/* We mask to ack (where we can), so we need to unmask at EOI. */
> +	if (!irqd_irq_disabled(d) && !irqd_irq_masked(d))

Ah, be careful here: irqd_irq_masked() doesn't do what you think it
does for per-CPU interrupts. It's been on my list to fix for the rVIC
implementation, but I never got around to doing it, and all decent ICs
hide this from SW by having a HW-managed mask, similar to what is on
the IRQ side.

I can see two possibilities:

- you can track the masked state directly and use that instead of
  these predicates

- you can just drop the masking altogether as this is only useful to a
  hosted hypervisor (KVM), which will have to do its own masking
  behind the scenes anyway

> +		aic_fiq_unmask(d);
> +}
> +

The rest looks good to me.

Thanks,

	M.
Will Deacon March 24, 2021, 7:57 p.m. UTC | #5
Hi Hector,

Sorry it took me so long to get to this. Some comments below.

On Fri, Mar 05, 2021 at 06:38:51AM +0900, Hector Martin wrote:
> This is the root interrupt controller used on Apple ARM SoCs such as the
> M1. This irqchip driver performs multiple functions:
> 
> * Handles both IRQs and FIQs
> 
> * Drives the AIC peripheral itself (which handles IRQs)
> 
> * Dispatches FIQs to downstream hard-wired clients (currently the ARM
>   timer).
> 
> * Implements a virtual IPI multiplexer to funnel multiple Linux IPIs
>   into a single hardware IPI
> 
> Signed-off-by: Hector Martin <marcan@marcan.st>
> ---
>  MAINTAINERS                     |   2 +
>  drivers/irqchip/Kconfig         |   8 +
>  drivers/irqchip/Makefile        |   1 +
>  drivers/irqchip/irq-apple-aic.c | 710 ++++++++++++++++++++++++++++++++
>  include/linux/cpuhotplug.h      |   1 +
>  5 files changed, 722 insertions(+)
>  create mode 100644 drivers/irqchip/irq-apple-aic.c

[...]

> + * Implementation notes:
> + *
> + * - This driver creates two IRQ domains, one for HW IRQs and internal FIQs,
> + *   and one for IPIs.
> + * - Since Linux needs more than 2 IPIs, we implement a software IRQ controller
> + *   and funnel all IPIs into one per-CPU IPI (the second "self" IPI is unused).
> + * - FIQ hwirq numbers are assigned after true hwirqs, and are per-cpu.
> + * - DT bindings use 3-cell form (like GIC):
> + *   - <0 nr flags> - hwirq #nr
> + *   - <1 nr flags> - FIQ #nr
> + *     - nr=0  Physical HV timer
> + *     - nr=1  Virtual HV timer
> + *     - nr=2  Physical guest timer
> + *     - nr=3  Virtual guest timer
> + *
> + */
> +
> +#define pr_fmt(fmt) "%s: " fmt, __func__

General nit: but I suspect many of the prints in here probably want to be
using the *_ratelimited variants.

> +static void __exception_irq_entry aic_handle_irq(struct pt_regs *regs)
> +{
> +	struct aic_irq_chip *ic = aic_irqc;
> +	u32 event, type, irq;
> +
> +	do {
> +		/*
> +		 * We cannot use a relaxed read here, as DMA needs to be
> +		 * ordered with respect to the IRQ firing.
> +		 */

I think this could be a bit clearer: the readl() doesn't order any DMA
accesses, but instead means that subsequent reads by the CPU are ordered
(which may be from a buffer which was DMA'd to) are ordered after the
read of the MMIO register.

> +		event = readl(ic->base + AIC_EVENT);
> +		type = FIELD_GET(AIC_EVENT_TYPE, event);
> +		irq = FIELD_GET(AIC_EVENT_NUM, event);
> +
> +		if (type == AIC_EVENT_TYPE_HW)
> +			handle_domain_irq(aic_irqc->hw_domain, irq, regs);
> +		else if (type == AIC_EVENT_TYPE_IPI && irq == 1)
> +			aic_handle_ipi(regs);
> +		else if (event != 0)
> +			pr_err("Unknown IRQ event %d, %d\n", type, irq);
> +	} while (event);
> +
> +	/*
> +	 * vGIC maintenance interrupts end up here too, so we need to check
> +	 * for them separately. Just report and disable vGIC for now, until
> +	 * we implement this properly.
> +	 */
> +	if ((read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EN) &&
> +		read_sysreg_s(SYS_ICH_MISR_EL2) != 0) {
> +		pr_err("vGIC IRQ fired, disabling.\n");
> +		sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EN, 0);
> +	}

What prevents all these system register accesses being speculated up before
the handler?

> +}
> +
> +static int aic_irq_set_affinity(struct irq_data *d,
> +				const struct cpumask *mask_val, bool force)
> +{
> +	irq_hw_number_t hwirq = irqd_to_hwirq(d);
> +	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
> +	int cpu;
> +
> +	if (hwirq > ic->nr_hw)
> +		return -EINVAL;
> +
> +	if (force)
> +		cpu = cpumask_first(mask_val);
> +	else
> +		cpu = cpumask_any_and(mask_val, cpu_online_mask);
> +
> +	aic_ic_write(ic, AIC_TARGET_CPU + hwirq * 4, BIT(cpu));
> +	irq_data_update_effective_affinity(d, cpumask_of(cpu));
> +
> +	return IRQ_SET_MASK_OK;
> +}
> +
> +static int aic_irq_set_type(struct irq_data *d, unsigned int type)
> +{
> +	return (type == IRQ_TYPE_LEVEL_HIGH) ? 0 : -EINVAL;
> +}
> +
> +static struct irq_chip aic_chip = {
> +	.name = "AIC",
> +	.irq_mask = aic_irq_mask,
> +	.irq_unmask = aic_irq_unmask,

I know these are driven by the higher-level irq chip code, but I'm a bit
confused as to what provides ordering if, e.g. something ends up calling:

	aic_chip.irq_mask(d);
	...
	aic_chip.irq_unmask(d);

I can't see any ISBs in here and they're writing to two different registers,
so can we end up with the IRQ masked after this sequence?

> +/*
> + * IPI irqchip
> + */
> +
> +static void aic_ipi_mask(struct irq_data *d)
> +{
> +	u32 irq_bit = BIT(irqd_to_hwirq(d));
> +	int this_cpu = smp_processor_id();
> +
> +	/* No specific ordering requirements needed here. */
> +	atomic_andnot(irq_bit, &aic_vipi_enable[this_cpu]);
> +}

Why not use a per-cpu variable here instead of an array of atomics? The pcpu
API has things for atomic updates (e.g. or, and, xchg).

> +static void aic_ipi_unmask(struct irq_data *d)
> +{
> +	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
> +	u32 irq_bit = BIT(irqd_to_hwirq(d));
> +	int this_cpu = smp_processor_id();
> +
> +	/*
> +	 * This must complete before the atomic_read_acquire() below to avoid
> +	 * racing aic_ipi_send_mask(). Use a dummy fetch op with release
> +	 * semantics for this. This is arch-specific: ARMv8 B2.3.3 specifies
> +	 * that writes with Release semantics are Barrier-ordered-before reads
> +	 * with Acquire semantics, even though the Linux arch-independent
> +	 * definition of these atomic ops does not.
> +	 */

I think a more idiomatic (and portable) way to do this would be to use
the relaxed accessors, but with smp_mb__after_atomic() between them. Do you
have a good reason for _not_ doing it like that?

> +	(void)atomic_fetch_or_release(irq_bit, &aic_vipi_enable[this_cpu]);
> +
> +	/*
> +	 * If a pending vIPI was unmasked, raise a HW IPI to ourselves.
> +	 * No barriers needed here since this is a self-IPI.
> +	 */
> +	if (atomic_read_acquire(&aic_vipi_flag[this_cpu]) & irq_bit)

"No barriers needed here" right before an acquire is confusing ;)

> +		aic_ic_write(ic, AIC_IPI_SEND, AIC_IPI_SEND_CPU(this_cpu));
> +}
> +
> +static void aic_ipi_send_mask(struct irq_data *d, const struct cpumask *mask)
> +{
> +	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
> +	u32 irq_bit = BIT(irqd_to_hwirq(d));
> +	u32 send = 0;
> +	int cpu;
> +	unsigned long pending;
> +
> +	for_each_cpu(cpu, mask) {
> +		/*
> +		 * This sequence is the mirror of the one in aic_ipi_unmask();
> +		 * see the comment there. Additionally, release semantics
> +		 * ensure that the vIPI flag set is ordered after any shared
> +		 * memory accesses that precede it. This therefore also pairs
> +		 * with the atomic_fetch_andnot in aic_handle_ipi().
> +		 */
> +		pending = atomic_fetch_or_release(irq_bit, &aic_vipi_flag[cpu]);

(same here)

> +		if (!(pending & irq_bit) && (atomic_read_acquire(&aic_vipi_enable[cpu]) & irq_bit))
> +			send |= AIC_IPI_SEND_CPU(cpu);
> +	}
> +
> +	/*
> +	 * The flag writes must complete before the physical IPI is issued
> +	 * to another CPU. This is implied by the control dependency on
> +	 * the result of atomic_read_acquire() above, which is itself
> +	 * already ordered after the vIPI flag write.
> +	 */
> +	if (send)
> +		aic_ic_write(ic, AIC_IPI_SEND, send);
> +}
> +
> +static struct irq_chip ipi_chip = {
> +	.name = "AIC-IPI",
> +	.irq_mask = aic_ipi_mask,
> +	.irq_unmask = aic_ipi_unmask,
> +	.ipi_send_mask = aic_ipi_send_mask,
> +};
> +
> +/*
> + * IPI IRQ domain
> + */
> +
> +static void aic_handle_ipi(struct pt_regs *regs)
> +{
> +	int this_cpu = smp_processor_id();
> +	int i;
> +	unsigned long enabled, firing;
> +
> +	/*
> +	 * Ack the IPI. We need to order this after the AIC event read, but
> +	 * that is enforced by normal MMIO ordering guarantees.
> +	 */
> +	aic_ic_write(aic_irqc, AIC_IPI_ACK, AIC_IPI_OTHER);
> +
> +	/*
> +	 * The mask read does not need to be ordered. Only we can change
> +	 * our own mask anyway, so no races are possible here, as long as
> +	 * we are properly in the interrupt handler (which is covered by
> +	 * the barrier that is part of the top-level AIC handler's readl()).
> +	 */
> +	enabled = atomic_read(&aic_vipi_enable[this_cpu]);
> +
> +	/*
> +	 * Clear the IPIs we are about to handle. This pairs with the
> +	 * atomic_fetch_or_release() in aic_ipi_send_mask(), and needs to be
> +	 * ordered after the aic_ic_write() above (to avoid dropping vIPIs) and
> +	 * before IPI handling code (to avoid races handling vIPIs before they
> +	 * are signaled). The former is taken care of by the release semantics
> +	 * of the write portion, while the latter is taken care of by the
> +	 * acquire semantics of the read portion.
> +	 */
> +	firing = atomic_fetch_andnot(enabled, &aic_vipi_flag[this_cpu]) & enabled;

Does this also need to be ordered after the Ack? For example, if we have
something like:

CPU 0						CPU 1
						<some other IPI>
aic_ipi_send_mask()
						atomic_fetch_andnot(flag)
	atomic_fetch_or_release(flag)
	aic_ic_write(AIC_IPI_SEND)
						aic_ic_write(AIC_IPI_ACK)

sorry if it's a stupid question, I'm just not sure about the cases in which
the hardware will pend things for you.

Will
Hector Martin March 26, 2021, 7:57 a.m. UTC | #6
On 08/03/2021 22.31, Marc Zyngier wrote:
>> +	if ((read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EN) &&
>> +		read_sysreg_s(SYS_ICH_MISR_EL2) != 0) {
>> +		pr_err("vGIC IRQ fired, disabling.\n");
> 
> Please add a _ratelimited here. Whilst debugging KVM on this machine,
> I ended up with this firing at such a rate that it was impossible to
> do anything useful. Ratelimiting it allowed me to pinpoint the
> problem.

Ouch. Done for v4.

>> +static void aic_fiq_eoi(struct irq_data *d)
>> +{
>> +	/* We mask to ack (where we can), so we need to unmask at EOI. */
>> +	if (!irqd_irq_disabled(d) && !irqd_irq_masked(d))
> 
> Ah, be careful here: irqd_irq_masked() doesn't do what you think it
> does for per-CPU interrupts. It's been on my list to fix for the rVIC
> implementation, but I never got around to doing it, and all decent ICs
> hide this from SW by having a HW-managed mask, similar to what is on
> the IRQ side.
> 
> I can see two possibilities:
> 
> - you can track the masked state directly and use that instead of
>    these predicates
> 
> - you can just drop the masking altogether as this is only useful to a
>    hosted hypervisor (KVM), which will have to do its own masking
>    behind the scenes anyway
> 

Since you're using the masking in KVM after all, I'm tracking the mask 
state in a percpu variable now. Also folded in your two minor bugfixes 
from the KVM series. Cheers!
Hector Martin March 26, 2021, 8:58 a.m. UTC | #7
Hi Will,

No worries, I was busy until a couple days ago anyway.

On 25/03/2021 04.57, Will Deacon wrote:
>> +#define pr_fmt(fmt) "%s: " fmt, __func__
> 
> General nit: but I suspect many of the prints in here probably want to be
> using the *_ratelimited variants.

You're right, everything complaining about spurious/unsupported stuff is 
probably safer rate-limited. I also made them all pr_err_ratelimited, 
since nothing should be trying to use that hardware before we get around 
to supporting it in this driver.

>> +static void __exception_irq_entry aic_handle_irq(struct pt_regs *regs)
>> +{
>> +	struct aic_irq_chip *ic = aic_irqc;
>> +	u32 event, type, irq;
>> +
>> +	do {
>> +		/*
>> +		 * We cannot use a relaxed read here, as DMA needs to be
>> +		 * ordered with respect to the IRQ firing.
>> +		 */
> 
> I think this could be a bit clearer: the readl() doesn't order any DMA
> accesses, but instead means that subsequent reads by the CPU are ordered
> (which may be from a buffer which was DMA'd to) are ordered after the
> read of the MMIO register.

Good point, reworded it for v4:

/*
  * We cannot use a relaxed read here, as reads from DMA buffers
  * need to be ordered after the IRQ fires.
  */

> 
>> +		event = readl(ic->base + AIC_EVENT);
>> +		type = FIELD_GET(AIC_EVENT_TYPE, event);
>> +		irq = FIELD_GET(AIC_EVENT_NUM, event);
>> +
>> +		if (type == AIC_EVENT_TYPE_HW)
>> +			handle_domain_irq(aic_irqc->hw_domain, irq, regs);
>> +		else if (type == AIC_EVENT_TYPE_IPI && irq == 1)
>> +			aic_handle_ipi(regs);
>> +		else if (event != 0)
>> +			pr_err("Unknown IRQ event %d, %d\n", type, irq);
>> +	} while (event);
>> +
>> +	/*
>> +	 * vGIC maintenance interrupts end up here too, so we need to check
>> +	 * for them separately. Just report and disable vGIC for now, until
>> +	 * we implement this properly.
>> +	 */
>> +	if ((read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EN) &&
>> +		read_sysreg_s(SYS_ICH_MISR_EL2) != 0) {
>> +		pr_err("vGIC IRQ fired, disabling.\n");
>> +		sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EN, 0);
>> +	}
> 
> What prevents all these system register accesses being speculated up before
> the handler?

Nothing, but that's not a problem, is it? If the condition is met, it 
means the vGIC IRQ *is* firing and needs clearing. We don't particularly 
care if this happens before, after, or during the rest of the IRQ handling.

I changed the message to this, because we actually should never hit this 
path with correctly-working KVM code (it takes care of it before this 
handler runs):

pr_err_ratelimited("vGIC IRQ fired and not handled by KVM, disabling.\n");

>> +static struct irq_chip aic_chip = {
>> +	.name = "AIC",
>> +	.irq_mask = aic_irq_mask,
>> +	.irq_unmask = aic_irq_unmask,
> 
> I know these are driven by the higher-level irq chip code, but I'm a bit
> confused as to what provides ordering if, e.g. something ends up calling:
> 
> 	aic_chip.irq_mask(d);
> 	...
> 	aic_chip.irq_unmask(d);
> 
> I can't see any ISBs in here and they're writing to two different registers,
> so can we end up with the IRQ masked after this sequence?

Wait, aren't MMIO writes to the same peripheral using device-nGnRnE 
memory modes always ordered with respect to each other? I thought the 
_relaxed versions were only trouble when mixed with memory/DMA buffers, 
and MMIO for any given peripheral always takes effect in program order.

>> +static void aic_ipi_mask(struct irq_data *d)
>> +{
>> +	u32 irq_bit = BIT(irqd_to_hwirq(d));
>> +	int this_cpu = smp_processor_id();
>> +
>> +	/* No specific ordering requirements needed here. */
>> +	atomic_andnot(irq_bit, &aic_vipi_enable[this_cpu]);
>> +}
> 
> Why not use a per-cpu variable here instead of an array of atomics? The pcpu
> API has things for atomic updates (e.g. or, and, xchg).

One CPU still needs to be able to mutate the flags of another CPU to 
fire an IPI; AIUI the per-cpu ops are *not* atomic for concurrent access 
by multiple CPUs, and in fact there is no API for that, only for "this CPU".

>> +static void aic_ipi_unmask(struct irq_data *d)
>> +{
>> +	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
>> +	u32 irq_bit = BIT(irqd_to_hwirq(d));
>> +	int this_cpu = smp_processor_id();
>> +
>> +	/*
>> +	 * This must complete before the atomic_read_acquire() below to avoid
>> +	 * racing aic_ipi_send_mask(). Use a dummy fetch op with release
>> +	 * semantics for this. This is arch-specific: ARMv8 B2.3.3 specifies
>> +	 * that writes with Release semantics are Barrier-ordered-before reads
>> +	 * with Acquire semantics, even though the Linux arch-independent
>> +	 * definition of these atomic ops does not.
>> +	 */
> 
> I think a more idiomatic (and portable) way to do this would be to use
> the relaxed accessors, but with smp_mb__after_atomic() between them. Do you
> have a good reason for _not_ doing it like that?

Not particularly, other than symmetry with the case below.

>> +		/*
>> +		 * This sequence is the mirror of the one in aic_ipi_unmask();
>> +		 * see the comment there. Additionally, release semantics
>> +		 * ensure that the vIPI flag set is ordered after any shared
>> +		 * memory accesses that precede it. This therefore also pairs
>> +		 * with the atomic_fetch_andnot in aic_handle_ipi().
>> +		 */
>> +		pending = atomic_fetch_or_release(irq_bit, &aic_vipi_flag[cpu]);

We do need the return data here, and the release semantics (or another 
barrier before it). But the read below can be made relaxed and a barrier 
used instead, and then the same patern above except with a plain 
atomic_or().

>> +		if (!(pending & irq_bit) && (atomic_read_acquire(&aic_vipi_enable[cpu]) & irq_bit))
>> +			send |= AIC_IPI_SEND_CPU(cpu);
>> +	}

[...]

>> +	/*
>> +	 * Clear the IPIs we are about to handle. This pairs with the
>> +	 * atomic_fetch_or_release() in aic_ipi_send_mask(), and needs to be
>> +	 * ordered after the aic_ic_write() above (to avoid dropping vIPIs) and
>> +	 * before IPI handling code (to avoid races handling vIPIs before they
>> +	 * are signaled). The former is taken care of by the release semantics
>> +	 * of the write portion, while the latter is taken care of by the
>> +	 * acquire semantics of the read portion.
>> +	 */
>> +	firing = atomic_fetch_andnot(enabled, &aic_vipi_flag[this_cpu]) & enabled;
> 
> Does this also need to be ordered after the Ack? For example, if we have
> something like:
> 
> CPU 0						CPU 1
> 						<some other IPI>
> aic_ipi_send_mask()
> 						atomic_fetch_andnot(flag)
> 	atomic_fetch_or_release(flag)
> 	aic_ic_write(AIC_IPI_SEND)
> 						aic_ic_write(AIC_IPI_ACK)
> 
> sorry if it's a stupid question, I'm just not sure about the cases in which
> the hardware will pend things for you.

It is ordered, right? As the comment says, it "needs to be ordered after 
the aic_ic_write() above". atomic_fetch_andnot() is *supposed* to be 
fully ordered and that should include against the writel_relaxed() on 
AIC_IPI_FLAG. On ARM it turns out it's not quite fully ordered, but the 
acquire semantics of the read half are sufficient for this case, as they 
guarantee the flags are always read after the FIQ has been ACKed.

Cheeers,
Hector Martin March 26, 2021, 1:40 p.m. UTC | #8
On 06/03/2021 00.05, Andy Shevchenko wrote:
>> +#define pr_fmt(fmt) "%s: " fmt, __func__
> 
> This is not needed, really, if you have unique / distinguishable
> messages in the first place.
> Rather people include module names, which may be useful.

Makes sense, I'll switch to KBUILD_MODNAME.

>> +#define MASK_BIT(x)            BIT((x) & 0x1f)
> 
> GENMASK(4,0)

It's not really a register bitmask, but rather extracting the low bits 
of an index... but sure, GENMASK also expresses that. Changed.

>> +static atomic_t aic_vipi_flag[AIC_MAX_CPUS];
>> +static atomic_t aic_vipi_enable[AIC_MAX_CPUS];
> 
> Isn't it easier to handle these when they are full width, i.e. 32
> items per the array?

I don't think so, it doesn't really buy us anything. It's just a maximum 
beyond which the driver doesn't work in its current state anyway (if the 
number were much larger it'd make sense to dynamically allocate these, 
but not at this point).

>> +static int aic_irq_set_affinity(struct irq_data *d,
>> +                               const struct cpumask *mask_val, bool force)
>> +{
>> +       irq_hw_number_t hwirq = irqd_to_hwirq(d);
>> +       struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
>> +       int cpu;
>> +
>> +       if (hwirq > ic->nr_hw)
> 
> >= ?

Good catch, but this is actually obsolete. Higher IRQs go into the FIQ 
irqchip, so this should never happen (it's a leftover from when they 
were a single one). I'll remove it.

Ack on the other comments, thanks!
Will Deacon March 29, 2021, 12:04 p.m. UTC | #9
Hi Hector,

On Fri, Mar 26, 2021 at 05:58:15PM +0900, Hector Martin wrote:
> On 25/03/2021 04.57, Will Deacon wrote:
> > > +		event = readl(ic->base + AIC_EVENT);
> > > +		type = FIELD_GET(AIC_EVENT_TYPE, event);
> > > +		irq = FIELD_GET(AIC_EVENT_NUM, event);
> > > +
> > > +		if (type == AIC_EVENT_TYPE_HW)
> > > +			handle_domain_irq(aic_irqc->hw_domain, irq, regs);
> > > +		else if (type == AIC_EVENT_TYPE_IPI && irq == 1)
> > > +			aic_handle_ipi(regs);
> > > +		else if (event != 0)
> > > +			pr_err("Unknown IRQ event %d, %d\n", type, irq);
> > > +	} while (event);
> > > +
> > > +	/*
> > > +	 * vGIC maintenance interrupts end up here too, so we need to check
> > > +	 * for them separately. Just report and disable vGIC for now, until
> > > +	 * we implement this properly.
> > > +	 */
> > > +	if ((read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EN) &&
> > > +		read_sysreg_s(SYS_ICH_MISR_EL2) != 0) {
> > > +		pr_err("vGIC IRQ fired, disabling.\n");
> > > +		sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EN, 0);
> > > +	}
> > 
> > What prevents all these system register accesses being speculated up before
> > the handler?
> 
> Nothing, but that's not a problem, is it? If the condition is met, it means
> the vGIC IRQ *is* firing and needs clearing. We don't particularly care if
> this happens before, after, or during the rest of the IRQ handling.
> 
> I changed the message to this, because we actually should never hit this
> path with correctly-working KVM code (it takes care of it before this
> handler runs):
> 
> pr_err_ratelimited("vGIC IRQ fired and not handled by KVM, disabling.\n");

Looks good.

> > > +static struct irq_chip aic_chip = {
> > > +	.name = "AIC",
> > > +	.irq_mask = aic_irq_mask,
> > > +	.irq_unmask = aic_irq_unmask,
> > 
> > I know these are driven by the higher-level irq chip code, but I'm a bit
> > confused as to what provides ordering if, e.g. something ends up calling:
> > 
> > 	aic_chip.irq_mask(d);
> > 	...
> > 	aic_chip.irq_unmask(d);
> > 
> > I can't see any ISBs in here and they're writing to two different registers,
> > so can we end up with the IRQ masked after this sequence?
> 
> Wait, aren't MMIO writes to the same peripheral using device-nGnRnE memory
> modes always ordered with respect to each other? I thought the _relaxed
> versions were only trouble when mixed with memory/DMA buffers, and MMIO for
> any given peripheral always takes effect in program order.

Sorry, this was my mistake -- I seem to have mixed up the MMIO parts with
the system register parts. In this case, aic_irq_[un]mask() are MMIO writes,
so things work out. It's the FIQ mask/unmask code that needs the ISBs.

> > > +static void aic_ipi_mask(struct irq_data *d)
> > > +{
> > > +	u32 irq_bit = BIT(irqd_to_hwirq(d));
> > > +	int this_cpu = smp_processor_id();
> > > +
> > > +	/* No specific ordering requirements needed here. */
> > > +	atomic_andnot(irq_bit, &aic_vipi_enable[this_cpu]);
> > > +}
> > 
> > Why not use a per-cpu variable here instead of an array of atomics? The pcpu
> > API has things for atomic updates (e.g. or, and, xchg).
> 
> One CPU still needs to be able to mutate the flags of another CPU to fire an
> IPI; AIUI the per-cpu ops are *not* atomic for concurrent access by multiple
> CPUs, and in fact there is no API for that, only for "this CPU".

Huh, I really thought we had an API for that, but you're right. Oh well! But
I'd still suggest a per-cpu atomic_t in that case, rather than the array.

> > > +static void aic_ipi_unmask(struct irq_data *d)
> > > +{
> > > +	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
> > > +	u32 irq_bit = BIT(irqd_to_hwirq(d));
> > > +	int this_cpu = smp_processor_id();
> > > +
> > > +	/*
> > > +	 * This must complete before the atomic_read_acquire() below to avoid
> > > +	 * racing aic_ipi_send_mask(). Use a dummy fetch op with release
> > > +	 * semantics for this. This is arch-specific: ARMv8 B2.3.3 specifies
> > > +	 * that writes with Release semantics are Barrier-ordered-before reads
> > > +	 * with Acquire semantics, even though the Linux arch-independent
> > > +	 * definition of these atomic ops does not.
> > > +	 */
> > 
> > I think a more idiomatic (and portable) way to do this would be to use
> > the relaxed accessors, but with smp_mb__after_atomic() between them. Do you
> > have a good reason for _not_ doing it like that?
> 
> Not particularly, other than symmetry with the case below.

I think it would be better not to rely on arm64-specific ordering unless
there's a good reason to.

> > > +		/*
> > > +		 * This sequence is the mirror of the one in aic_ipi_unmask();
> > > +		 * see the comment there. Additionally, release semantics
> > > +		 * ensure that the vIPI flag set is ordered after any shared
> > > +		 * memory accesses that precede it. This therefore also pairs
> > > +		 * with the atomic_fetch_andnot in aic_handle_ipi().
> > > +		 */
> > > +		pending = atomic_fetch_or_release(irq_bit, &aic_vipi_flag[cpu]);
> 
> We do need the return data here, and the release semantics (or another
> barrier before it). But the read below can be made relaxed and a barrier
> used instead, and then the same patern above except with a plain
> atomic_or().

Yes, I think using atomic_fetch_or() followed by atomic_read() would be
best (obviously with the relevant comments!)

> 
> > > +		if (!(pending & irq_bit) && (atomic_read_acquire(&aic_vipi_enable[cpu]) & irq_bit))
> > > +			send |= AIC_IPI_SEND_CPU(cpu);
> > > +	}
> 
> [...]
> 
> > > +	/*
> > > +	 * Clear the IPIs we are about to handle. This pairs with the
> > > +	 * atomic_fetch_or_release() in aic_ipi_send_mask(), and needs to be
> > > +	 * ordered after the aic_ic_write() above (to avoid dropping vIPIs) and
> > > +	 * before IPI handling code (to avoid races handling vIPIs before they
> > > +	 * are signaled). The former is taken care of by the release semantics
> > > +	 * of the write portion, while the latter is taken care of by the
> > > +	 * acquire semantics of the read portion.
> > > +	 */
> > > +	firing = atomic_fetch_andnot(enabled, &aic_vipi_flag[this_cpu]) & enabled;
> > 
> > Does this also need to be ordered after the Ack? For example, if we have
> > something like:
> > 
> > CPU 0						CPU 1
> > 						<some other IPI>
> > aic_ipi_send_mask()
> > 						atomic_fetch_andnot(flag)
> > 	atomic_fetch_or_release(flag)
> > 	aic_ic_write(AIC_IPI_SEND)
> > 						aic_ic_write(AIC_IPI_ACK)
> > 
> > sorry if it's a stupid question, I'm just not sure about the cases in which
> > the hardware will pend things for you.
> 
> It is ordered, right? As the comment says, it "needs to be ordered after the
> aic_ic_write() above". atomic_fetch_andnot() is *supposed* to be fully
> ordered and that should include against the writel_relaxed() on
> AIC_IPI_FLAG. On ARM it turns out it's not quite fully ordered, but the
> acquire semantics of the read half are sufficient for this case, as they
> guarantee the flags are always read after the FIQ has been ACKed.

Sorry, I missed that the answer to my question was already written in the
comment. However, I'm still a bit unsure about whether the memory barriers
give you what you need here. The barrier in atomic_fetch_andnot() will
order the previous aic_ic_write(AIC_IPI_ACK) for the purposes of other
CPUs reading those locations, but it doesn't say anything about when the
interrupt controller actually changes state after the Ack.

Given that the AIC is mapped Device-nGnRnE, the Arm ARM offers:

  | Additionally, for Device-nGnRnE memory, a read or write of a Location
  | in a Memory-mapped peripheral that exhibits side-effects is complete
  | only when the read or write both:
  |
  | * Can begin to affect the state of the Memory-mapped peripheral.
  | * Can trigger all associated side-effects, whether they affect other
  |   peripheral devices, PEs, or memory.

so without AIC documentation I can't tell whether completion of the Ack write
just begins the process of an Ack (in which case we might need something like
a read-back), or whether the write response back from the AIC only occurs once
the Ack has taken effect. Any ideas?

> Cheeers,

No prooblem :)

Will
Hector Martin April 1, 2021, 1:16 p.m. UTC | #10
Hi Will,

On 29/03/2021 21.04, Will Deacon wrote:
>> One CPU still needs to be able to mutate the flags of another CPU to fire an
>> IPI; AIUI the per-cpu ops are *not* atomic for concurrent access by multiple
>> CPUs, and in fact there is no API for that, only for "this CPU".
> 
> Huh, I really thought we had an API for that, but you're right. Oh well! But
> I'd still suggest a per-cpu atomic_t in that case, rather than the array.

Yeah, after digging into the per-cpu stuff earlier and understanding how 
it works, I agree that a per-cpu atomic makes sense here. Switched it to 
that (which simplified out a bunch of smp_processor_id() calls too). Thanks!

>>> I think a more idiomatic (and portable) way to do this would be to use
>>> the relaxed accessors, but with smp_mb__after_atomic() between them. Do you
>>> have a good reason for _not_ doing it like that?
>>
>> Not particularly, other than symmetry with the case below.
> 
> I think it would be better not to rely on arm64-specific ordering unless
> there's a good reason to.

Sounds reasonable, I'll switch to the barrier version.

>> We do need the return data here, and the release semantics (or another
>> barrier before it). But the read below can be made relaxed and a barrier
>> used instead, and then the same patern above except with a plain
>> atomic_or().
> 
> Yes, I think using atomic_fetch_or() followed by atomic_read() would be
> best (obviously with the relevant comments!)

atomic_fetch_or_release is sufficient here (atomic_fetch_or is stronger; 
atomic_fetch_or_relaxed would not be strong enough as this needs to be 
ordered after any writes prior to sending the IPI; in this case release 
semantics also make logical sense).

>> It is ordered, right? As the comment says, it "needs to be ordered after the
>> aic_ic_write() above". atomic_fetch_andnot() is *supposed* to be fully
>> ordered and that should include against the writel_relaxed() on
>> AIC_IPI_FLAG. On ARM it turns out it's not quite fully ordered, but the
>> acquire semantics of the read half are sufficient for this case, as they
>> guarantee the flags are always read after the FIQ has been ACKed.
> 
> Sorry, I missed that the answer to my question was already written in the
> comment. However, I'm still a bit unsure about whether the memory barriers
> give you what you need here. The barrier in atomic_fetch_andnot() will
> order the previous aic_ic_write(AIC_IPI_ACK) for the purposes of other
> CPUs reading those locations, but it doesn't say anything about when the
> interrupt controller actually changes state after the Ack.
> 
> Given that the AIC is mapped Device-nGnRnE, the Arm ARM offers:
> 
>    | Additionally, for Device-nGnRnE memory, a read or write of a Location
>    | in a Memory-mapped peripheral that exhibits side-effects is complete
>    | only when the read or write both:
>    |
>    | * Can begin to affect the state of the Memory-mapped peripheral.
>    | * Can trigger all associated side-effects, whether they affect other
>    |   peripheral devices, PEs, or memory.
> 
> so without AIC documentation I can't tell whether completion of the Ack write
> just begins the process of an Ack (in which case we might need something like
> a read-back), or whether the write response back from the AIC only occurs once
> the Ack has taken effect. Any ideas?

Ahh, you're talking about latency within AIC itself... I obviously don't 
have an authoritative answer to this, though the hardware designer in me 
wants to say this really ought to be single-cycle type stuff that isn't 
internally pipelined in a way that would create races.

I tried to set up an SMP test case for the atomic-to-AIC sequence in 
m1n1, but unfortunately I couldn't hit the race window in deliberately 
racy code (i.e. ack after clearing flags) without widening it even 
further with at least one dummy load in between, and of course I didn't 
experience any races with the proper code either.

What I can say is that a simple set IPI; ack IPI (in adjacent str 
instructions) sequence always yields a cleared IPI, and the converse 
always yields a set IPI. So if there is latency to the operations it 
seems it would at least be the same for sets and acks and would imply 
readbacks block, which should still yield equivalently correct results. 
But of course this is a single-CPU test, so it is not fully 
representative of what could happen in an SMP scenario.

At this point all I can say is I'm inclined to shrug and say we have no 
evidence of this being something that can happen, and it shouldn't in 
sane hardware, and hope for the best :-)

Thanks,
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 744e086d28cd..28bd46f4f7a7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1648,6 +1648,8 @@  T:	git https://github.com/AsahiLinux/linux.git
 F:	Documentation/devicetree/bindings/arm/apple.yaml
 F:	Documentation/devicetree/bindings/interrupt-controller/apple,aic.yaml
 F:	arch/arm64/include/asm/sysreg_apple.h
+F:	drivers/irqchip/irq-apple-aic.c
+F:	include/dt-bindings/interrupt-controller/apple-aic.h
 
 ARM/ARTPEC MACHINE SUPPORT
 M:	Jesper Nilsson <jesper.nilsson@axis.com>
diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 15536e321df5..d3a14f304ec8 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -577,4 +577,12 @@  config MST_IRQ
 	help
 	  Support MStar Interrupt Controller.
 
+config APPLE_AIC
+	bool "Apple Interrupt Controller (AIC)"
+	depends on ARM64
+	default ARCH_APPLE
+	help
+	  Support for the Apple Interrupt Controller found on Apple Silicon SoCs,
+	  such as the M1.
+
 endmenu
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index c59b95a0532c..eb6a515f0f64 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -113,3 +113,4 @@  obj-$(CONFIG_LOONGSON_PCH_MSI)		+= irq-loongson-pch-msi.o
 obj-$(CONFIG_MST_IRQ)			+= irq-mst-intc.o
 obj-$(CONFIG_SL28CPLD_INTC)		+= irq-sl28cpld.o
 obj-$(CONFIG_MACH_REALTEK_RTL)		+= irq-realtek-rtl.o
+obj-$(CONFIG_APPLE_AIC)			+= irq-apple-aic.o
diff --git a/drivers/irqchip/irq-apple-aic.c b/drivers/irqchip/irq-apple-aic.c
new file mode 100644
index 000000000000..ddc0856f36a5
--- /dev/null
+++ b/drivers/irqchip/irq-apple-aic.c
@@ -0,0 +1,710 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright The Asahi Linux Contributors
+ *
+ * Based on irq-lpc32xx:
+ *   Copyright 2015-2016 Vladimir Zapolskiy <vz@mleia.com>
+ * Based on irq-bcm2836:
+ *   Copyright 2015 Broadcom
+ */
+
+/*
+ * AIC is a fairly simple interrupt controller with the following features:
+ *
+ * - 896 level-triggered hardware IRQs
+ *   - Single mask bit per IRQ
+ *   - Per-IRQ affinity setting
+ *   - Automatic masking on event delivery (auto-ack)
+ *   - Software triggering (ORed with hw line)
+ * - 2 per-CPU IPIs (meant as "self" and "other", but they are
+ *   interchangeable if not symmetric)
+ * - Automatic prioritization (single event/ack register per CPU, lower IRQs =
+ *   higher priority)
+ * - Automatic masking on ack
+ * - Default "this CPU" register view and explicit per-CPU views
+ *
+ * In addition, this driver also handles FIQs, as these are routed to the same
+ * IRQ vector. These are used for Fast IPIs (TODO), the ARMv8 timer IRQs, and
+ * performance counters (TODO).
+ *
+ * Implementation notes:
+ *
+ * - This driver creates two IRQ domains, one for HW IRQs and internal FIQs,
+ *   and one for IPIs.
+ * - Since Linux needs more than 2 IPIs, we implement a software IRQ controller
+ *   and funnel all IPIs into one per-CPU IPI (the second "self" IPI is unused).
+ * - FIQ hwirq numbers are assigned after true hwirqs, and are per-cpu.
+ * - DT bindings use 3-cell form (like GIC):
+ *   - <0 nr flags> - hwirq #nr
+ *   - <1 nr flags> - FIQ #nr
+ *     - nr=0  Physical HV timer
+ *     - nr=1  Virtual HV timer
+ *     - nr=2  Physical guest timer
+ *     - nr=3  Virtual guest timer
+ *
+ */
+
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
+#include <linux/bits.h>
+#include <linux/bitfield.h>
+#include <linux/cpuhotplug.h>
+#include <linux/io.h>
+#include <linux/irqchip.h>
+#include <linux/irqdomain.h>
+#include <linux/of_address.h>
+#include <linux/slab.h>
+#include <asm/exception.h>
+#include <asm/sysreg.h>
+#include <asm/sysreg_apple.h>
+
+#include <dt-bindings/interrupt-controller/apple-aic.h>
+
+#define AIC_INFO		0x0004
+#define AIC_INFO_NR_HW		GENMASK(15, 0)
+
+#define AIC_CONFIG		0x0010
+
+#define AIC_WHOAMI		0x2000
+#define AIC_EVENT		0x2004
+#define AIC_EVENT_TYPE		GENMASK(31, 16)
+#define AIC_EVENT_NUM		GENMASK(15, 0)
+
+#define AIC_EVENT_TYPE_HW	1
+#define AIC_EVENT_TYPE_IPI	4
+#define AIC_EVENT_IPI_OTHER	1
+#define AIC_EVENT_IPI_SELF	2
+
+#define AIC_IPI_SEND		0x2008
+#define AIC_IPI_ACK		0x200c
+#define AIC_IPI_MASK_SET	0x2024
+#define AIC_IPI_MASK_CLR	0x2028
+
+#define AIC_IPI_SEND_CPU(cpu)	BIT(cpu)
+
+#define AIC_IPI_OTHER		BIT(0)
+#define AIC_IPI_SELF		BIT(31)
+
+#define AIC_TARGET_CPU		0x3000
+#define AIC_SW_SET		0x4000
+#define AIC_SW_CLR		0x4080
+#define AIC_MASK_SET		0x4100
+#define AIC_MASK_CLR		0x4180
+
+#define AIC_CPU_IPI_SET(cpu)	(0x5008 + ((cpu) << 7))
+#define AIC_CPU_IPI_CLR(cpu)	(0x500c + ((cpu) << 7))
+#define AIC_CPU_IPI_MASK_SET(cpu) (0x5024 + ((cpu) << 7))
+#define AIC_CPU_IPI_MASK_CLR(cpu) (0x5028 + ((cpu) << 7))
+
+#define MASK_REG(x)		(4 * ((x) >> 5))
+#define MASK_BIT(x)		BIT((x) & 0x1f)
+
+#define AIC_NR_FIQ		4
+#define AIC_NR_SWIPI		32
+
+/*
+ * Max 31 bits in IPI SEND register (top bit is self).
+ * >=32-core chips will need code changes anyway.
+ */
+#define AIC_MAX_CPUS		31
+
+struct aic_irq_chip {
+	void __iomem *base;
+	struct irq_domain *hw_domain;
+	struct irq_domain *ipi_domain;
+	int nr_hw;
+	int ipi_hwirq;
+};
+
+static atomic_t aic_vipi_flag[AIC_MAX_CPUS];
+static atomic_t aic_vipi_enable[AIC_MAX_CPUS];
+
+static struct aic_irq_chip *aic_irqc;
+
+static void aic_handle_ipi(struct pt_regs *regs);
+
+static u32 aic_ic_read(struct aic_irq_chip *ic, u32 reg)
+{
+	return readl_relaxed(ic->base + reg);
+}
+
+static void aic_ic_write(struct aic_irq_chip *ic, u32 reg, u32 val)
+{
+	writel_relaxed(val, ic->base + reg);
+}
+
+/*
+ * IRQ irqchip
+ */
+
+static void aic_irq_mask(struct irq_data *d)
+{
+	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
+
+	aic_ic_write(ic, AIC_MASK_SET + MASK_REG(d->hwirq),
+		     MASK_BIT(d->hwirq));
+}
+
+static void aic_irq_unmask(struct irq_data *d)
+{
+	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
+
+	aic_ic_write(ic, AIC_MASK_CLR + MASK_REG(d->hwirq),
+		     MASK_BIT(d->hwirq));
+}
+
+static void aic_irq_eoi(struct irq_data *d)
+{
+	/*
+	 * Reading the interrupt reason automatically acknowledges and masks
+	 * the IRQ, so we just unmask it here if needed.
+	 */
+	if (!irqd_irq_disabled(d) && !irqd_irq_masked(d))
+		aic_irq_unmask(d);
+}
+
+static void __exception_irq_entry aic_handle_irq(struct pt_regs *regs)
+{
+	struct aic_irq_chip *ic = aic_irqc;
+	u32 event, type, irq;
+
+	do {
+		/*
+		 * We cannot use a relaxed read here, as DMA needs to be
+		 * ordered with respect to the IRQ firing.
+		 */
+		event = readl(ic->base + AIC_EVENT);
+		type = FIELD_GET(AIC_EVENT_TYPE, event);
+		irq = FIELD_GET(AIC_EVENT_NUM, event);
+
+		if (type == AIC_EVENT_TYPE_HW)
+			handle_domain_irq(aic_irqc->hw_domain, irq, regs);
+		else if (type == AIC_EVENT_TYPE_IPI && irq == 1)
+			aic_handle_ipi(regs);
+		else if (event != 0)
+			pr_err("Unknown IRQ event %d, %d\n", type, irq);
+	} while (event);
+
+	/*
+	 * vGIC maintenance interrupts end up here too, so we need to check
+	 * for them separately. Just report and disable vGIC for now, until
+	 * we implement this properly.
+	 */
+	if ((read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EN) &&
+		read_sysreg_s(SYS_ICH_MISR_EL2) != 0) {
+		pr_err("vGIC IRQ fired, disabling.\n");
+		sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EN, 0);
+	}
+}
+
+static int aic_irq_set_affinity(struct irq_data *d,
+				const struct cpumask *mask_val, bool force)
+{
+	irq_hw_number_t hwirq = irqd_to_hwirq(d);
+	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
+	int cpu;
+
+	if (hwirq > ic->nr_hw)
+		return -EINVAL;
+
+	if (force)
+		cpu = cpumask_first(mask_val);
+	else
+		cpu = cpumask_any_and(mask_val, cpu_online_mask);
+
+	aic_ic_write(ic, AIC_TARGET_CPU + hwirq * 4, BIT(cpu));
+	irq_data_update_effective_affinity(d, cpumask_of(cpu));
+
+	return IRQ_SET_MASK_OK;
+}
+
+static int aic_irq_set_type(struct irq_data *d, unsigned int type)
+{
+	return (type == IRQ_TYPE_LEVEL_HIGH) ? 0 : -EINVAL;
+}
+
+static struct irq_chip aic_chip = {
+	.name = "AIC",
+	.irq_mask = aic_irq_mask,
+	.irq_unmask = aic_irq_unmask,
+	.irq_eoi = aic_irq_eoi,
+	.irq_set_affinity = aic_irq_set_affinity,
+	.irq_set_type = aic_irq_set_type,
+};
+
+/*
+ * FIQ irqchip
+ */
+
+static void aic_fiq_mask(struct irq_data *d)
+{
+	/* Only the guest timers have real mask bits, unfortunately. */
+	switch (d->hwirq) {
+	case AIC_TMR_GUEST_PHYS:
+		sysreg_clear_set_s(SYS_APL_VM_TMR_FIQ_ENA_EL1, VM_TMR_FIQ_ENABLE_P, 0);
+		break;
+	case AIC_TMR_GUEST_VIRT:
+		sysreg_clear_set_s(SYS_APL_VM_TMR_FIQ_ENA_EL1, VM_TMR_FIQ_ENABLE_V, 0);
+		break;
+	}
+}
+
+static void aic_fiq_unmask(struct irq_data *d)
+{
+	switch (d->hwirq) {
+	case AIC_TMR_GUEST_PHYS:
+		sysreg_clear_set_s(SYS_APL_VM_TMR_FIQ_ENA_EL1, 0, VM_TMR_FIQ_ENABLE_P);
+		break;
+	case AIC_TMR_GUEST_VIRT:
+		sysreg_clear_set_s(SYS_APL_VM_TMR_FIQ_ENA_EL1, 0, VM_TMR_FIQ_ENABLE_V);
+		break;
+	}
+}
+
+static void aic_fiq_eoi(struct irq_data *d)
+{
+	/* We mask to ack (where we can), so we need to unmask at EOI. */
+	if (!irqd_irq_disabled(d) && !irqd_irq_masked(d))
+		aic_fiq_unmask(d);
+}
+
+#define TIMER_FIRING(x)                                                        \
+	(((x) & (ARCH_TIMER_CTRL_ENABLE | ARCH_TIMER_CTRL_IT_MASK |            \
+		 ARCH_TIMER_CTRL_IT_STAT)) ==                                  \
+	 (ARCH_TIMER_CTRL_ENABLE | ARCH_TIMER_CTRL_IT_STAT))
+
+static void __exception_irq_entry aic_handle_fiq(struct pt_regs *regs)
+{
+	/*
+	 * It would be really nice if we had a system register that lets us get
+	 * the FIQ source state without having to peek down into sources...
+	 * but such a register does not seem to exist.
+	 *
+	 * So, we have these potential sources to test for:
+	 *  - Fast IPIs (not yet used)
+	 *  - The 4 timers (CNTP, CNTV for each of HV and guest)
+	 *  - Per-core PMCs (not yet supported)
+	 *  - Per-cluster uncore PMCs (not yet supported)
+	 *
+	 * Since not dealing with any of these results in a FIQ storm,
+	 * we check for everything here, even things we don't support yet.
+	 */
+
+	if (read_sysreg_s(SYS_APL_IPI_SR_EL1) & IPI_SR_PENDING) {
+		pr_warn("Fast IPI fired. Acking.\n");
+		write_sysreg_s(IPI_SR_PENDING, SYS_APL_IPI_SR_EL1);
+	}
+
+	if (TIMER_FIRING(read_sysreg(cntp_ctl_el0)))
+		handle_domain_irq(aic_irqc->hw_domain,
+				  aic_irqc->nr_hw + AIC_TMR_HV_PHYS, regs);
+
+	if (TIMER_FIRING(read_sysreg(cntv_ctl_el0)))
+		handle_domain_irq(aic_irqc->hw_domain,
+				  aic_irqc->nr_hw + AIC_TMR_HV_VIRT, regs);
+
+	if (TIMER_FIRING(read_sysreg_s(SYS_CNTP_CTL_EL02)))
+		handle_domain_irq(aic_irqc->hw_domain,
+				  aic_irqc->nr_hw + AIC_TMR_GUEST_PHYS, regs);
+
+	if (TIMER_FIRING(read_sysreg_s(SYS_CNTV_CTL_EL02)))
+		handle_domain_irq(aic_irqc->hw_domain,
+				  aic_irqc->nr_hw + AIC_TMR_GUEST_VIRT, regs);
+
+	if ((read_sysreg_s(SYS_APL_PMCR0_EL1) & (PMCR0_IMODE | PMCR0_IACT))
+			== (FIELD_PREP(PMCR0_IMODE, PMCR0_IMODE_FIQ) | PMCR0_IACT)) {
+		/*
+		 * Not supported yet, let's figure out how to handle this when
+		 * we implement these proprietary performance counters. For now,
+		 * just mask it and move on.
+		 */
+		pr_warn("PMC FIQ fired. Masking.\n");
+		sysreg_clear_set_s(SYS_APL_PMCR0_EL1, PMCR0_IMODE | PMCR0_IACT,
+				   FIELD_PREP(PMCR0_IMODE, PMCR0_IMODE_OFF));
+	}
+
+	if (FIELD_GET(UPMCR0_IMODE, read_sysreg_s(SYS_APL_UPMCR0_EL1)) == UPMCR0_IMODE_FIQ &&
+			(read_sysreg_s(SYS_APL_UPMSR_EL1) & UPMSR_IACT)) {
+		/* Same story with uncore PMCs */
+		pr_warn("Uncore PMC FIQ fired. Masking.\n");
+		sysreg_clear_set_s(SYS_APL_UPMCR0_EL1, UPMCR0_IMODE,
+				   FIELD_PREP(UPMCR0_IMODE, UPMCR0_IMODE_OFF));
+	}
+}
+
+static struct irq_chip fiq_chip = {
+	.name = "AIC-FIQ",
+	.irq_mask = aic_fiq_mask,
+	.irq_unmask = aic_fiq_unmask,
+	.irq_ack = aic_fiq_mask,
+	.irq_eoi = aic_fiq_eoi,
+	.irq_set_type = aic_irq_set_type,
+};
+
+/*
+ * Main IRQ domain
+ */
+
+static int aic_irq_domain_map(struct irq_domain *id, unsigned int irq,
+			      irq_hw_number_t hw)
+{
+	struct aic_irq_chip *ic = id->host_data;
+
+	if (hw < ic->nr_hw) {
+		irq_domain_set_info(id, irq, hw, &aic_chip, id->host_data,
+				    handle_fasteoi_irq, NULL, NULL);
+		irqd_set_single_target(irq_desc_get_irq_data(irq_to_desc(irq)));
+	} else {
+		irq_set_percpu_devid(irq);
+		irq_domain_set_info(id, irq, hw, &fiq_chip, id->host_data,
+				    handle_percpu_devid_irq, NULL, NULL);
+	}
+
+	return 0;
+}
+
+static int aic_irq_domain_translate(struct irq_domain *id,
+				    struct irq_fwspec *fwspec,
+				    unsigned long *hwirq,
+				    unsigned int *type)
+{
+	struct aic_irq_chip *ic = id->host_data;
+
+	if (fwspec->param_count != 3 || !is_of_node(fwspec->fwnode))
+		return -EINVAL;
+
+	switch (fwspec->param[0]) {
+	case AIC_IRQ:
+		if (fwspec->param[1] >= ic->nr_hw)
+			return -EINVAL;
+		*hwirq = 0;
+		break;
+	case AIC_FIQ:
+		if (fwspec->param[1] >= AIC_NR_FIQ)
+			return -EINVAL;
+		*hwirq = ic->nr_hw;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	*hwirq += fwspec->param[1];
+	*type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;
+
+	return 0;
+}
+
+static int aic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+				unsigned int nr_irqs, void *arg)
+{
+	unsigned int type = IRQ_TYPE_NONE;
+	struct irq_fwspec *fwspec = arg;
+	irq_hw_number_t hwirq;
+	int i, ret;
+
+	ret = aic_irq_domain_translate(domain, fwspec, &hwirq, &type);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < nr_irqs; i++) {
+		ret = aic_irq_domain_map(domain, virq + i, hwirq + i);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static void aic_irq_domain_free(struct irq_domain *domain, unsigned int virq,
+				unsigned int nr_irqs)
+{
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		struct irq_data *d = irq_domain_get_irq_data(domain, virq + i);
+
+		irq_set_handler(virq + i, NULL);
+		irq_domain_reset_irq_data(d);
+	}
+}
+
+static const struct irq_domain_ops aic_irq_domain_ops = {
+	.translate	= aic_irq_domain_translate,
+	.alloc		= aic_irq_domain_alloc,
+	.free		= aic_irq_domain_free,
+};
+
+/*
+ * IPI irqchip
+ */
+
+static void aic_ipi_mask(struct irq_data *d)
+{
+	u32 irq_bit = BIT(irqd_to_hwirq(d));
+	int this_cpu = smp_processor_id();
+
+	/* No specific ordering requirements needed here. */
+	atomic_andnot(irq_bit, &aic_vipi_enable[this_cpu]);
+}
+
+static void aic_ipi_unmask(struct irq_data *d)
+{
+	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
+	u32 irq_bit = BIT(irqd_to_hwirq(d));
+	int this_cpu = smp_processor_id();
+
+	/*
+	 * This must complete before the atomic_read_acquire() below to avoid
+	 * racing aic_ipi_send_mask(). Use a dummy fetch op with release
+	 * semantics for this. This is arch-specific: ARMv8 B2.3.3 specifies
+	 * that writes with Release semantics are Barrier-ordered-before reads
+	 * with Acquire semantics, even though the Linux arch-independent
+	 * definition of these atomic ops does not.
+	 */
+	(void)atomic_fetch_or_release(irq_bit, &aic_vipi_enable[this_cpu]);
+
+	/*
+	 * If a pending vIPI was unmasked, raise a HW IPI to ourselves.
+	 * No barriers needed here since this is a self-IPI.
+	 */
+	if (atomic_read_acquire(&aic_vipi_flag[this_cpu]) & irq_bit)
+		aic_ic_write(ic, AIC_IPI_SEND, AIC_IPI_SEND_CPU(this_cpu));
+}
+
+static void aic_ipi_send_mask(struct irq_data *d, const struct cpumask *mask)
+{
+	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
+	u32 irq_bit = BIT(irqd_to_hwirq(d));
+	u32 send = 0;
+	int cpu;
+	unsigned long pending;
+
+	for_each_cpu(cpu, mask) {
+		/*
+		 * This sequence is the mirror of the one in aic_ipi_unmask();
+		 * see the comment there. Additionally, release semantics
+		 * ensure that the vIPI flag set is ordered after any shared
+		 * memory accesses that precede it. This therefore also pairs
+		 * with the atomic_fetch_andnot in aic_handle_ipi().
+		 */
+		pending = atomic_fetch_or_release(irq_bit, &aic_vipi_flag[cpu]);
+
+		if (!(pending & irq_bit) && (atomic_read_acquire(&aic_vipi_enable[cpu]) & irq_bit))
+			send |= AIC_IPI_SEND_CPU(cpu);
+	}
+
+	/*
+	 * The flag writes must complete before the physical IPI is issued
+	 * to another CPU. This is implied by the control dependency on
+	 * the result of atomic_read_acquire() above, which is itself
+	 * already ordered after the vIPI flag write.
+	 */
+	if (send)
+		aic_ic_write(ic, AIC_IPI_SEND, send);
+}
+
+static struct irq_chip ipi_chip = {
+	.name = "AIC-IPI",
+	.irq_mask = aic_ipi_mask,
+	.irq_unmask = aic_ipi_unmask,
+	.ipi_send_mask = aic_ipi_send_mask,
+};
+
+/*
+ * IPI IRQ domain
+ */
+
+static void aic_handle_ipi(struct pt_regs *regs)
+{
+	int this_cpu = smp_processor_id();
+	int i;
+	unsigned long enabled, firing;
+
+	/*
+	 * Ack the IPI. We need to order this after the AIC event read, but
+	 * that is enforced by normal MMIO ordering guarantees.
+	 */
+	aic_ic_write(aic_irqc, AIC_IPI_ACK, AIC_IPI_OTHER);
+
+	/*
+	 * The mask read does not need to be ordered. Only we can change
+	 * our own mask anyway, so no races are possible here, as long as
+	 * we are properly in the interrupt handler (which is covered by
+	 * the barrier that is part of the top-level AIC handler's readl()).
+	 */
+	enabled = atomic_read(&aic_vipi_enable[this_cpu]);
+
+	/*
+	 * Clear the IPIs we are about to handle. This pairs with the
+	 * atomic_fetch_or_release() in aic_ipi_send_mask(), and needs to be
+	 * ordered after the aic_ic_write() above (to avoid dropping vIPIs) and
+	 * before IPI handling code (to avoid races handling vIPIs before they
+	 * are signaled). The former is taken care of by the release semantics
+	 * of the write portion, while the latter is taken care of by the
+	 * acquire semantics of the read portion.
+	 */
+	firing = atomic_fetch_andnot(enabled, &aic_vipi_flag[this_cpu]) & enabled;
+
+	for_each_set_bit(i, &firing, AIC_NR_SWIPI) {
+		handle_domain_irq(aic_irqc->ipi_domain, i, regs);
+	}
+
+	/*
+	 * No ordering needed here; at worst this just changes the timing of
+	 * when the next IPI will be delivered.
+	 */
+	aic_ic_write(aic_irqc, AIC_IPI_MASK_CLR, AIC_IPI_OTHER);
+}
+
+static int aic_ipi_alloc(struct irq_domain *d, unsigned int virq,
+			 unsigned int nr_irqs, void *args)
+{
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_set_percpu_devid(virq + i);
+		irq_domain_set_info(d, virq + i, i, &ipi_chip, d->host_data,
+				    handle_percpu_devid_irq, NULL, NULL);
+	}
+
+	return 0;
+}
+
+static void aic_ipi_free(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs)
+{
+	/* Not freeing IPIs */
+}
+
+static const struct irq_domain_ops aic_ipi_domain_ops = {
+	.alloc = aic_ipi_alloc,
+	.free = aic_ipi_free,
+};
+
+static int aic_init_smp(struct aic_irq_chip *irqc, struct device_node *node)
+{
+	int base_ipi;
+
+	irqc->ipi_domain = irq_domain_create_linear(irqc->hw_domain->fwnode, AIC_NR_SWIPI,
+						    &aic_ipi_domain_ops, irqc);
+	if (WARN_ON(!irqc->ipi_domain))
+		return -ENODEV;
+
+	irqc->ipi_domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE;
+	irq_domain_update_bus_token(irqc->ipi_domain, DOMAIN_BUS_IPI);
+
+	base_ipi = __irq_domain_alloc_irqs(irqc->ipi_domain, -1, AIC_NR_SWIPI,
+					   NUMA_NO_NODE, NULL, false, NULL);
+
+	if (WARN_ON(!base_ipi)) {
+		irq_domain_remove(irqc->ipi_domain);
+		return -ENODEV;
+	}
+
+	set_smp_ipi_range(base_ipi, AIC_NR_SWIPI);
+
+	return 0;
+}
+
+static int aic_init_cpu(unsigned int cpu)
+{
+	/* Mask all hard-wired per-CPU IRQ/FIQ sources */
+
+	/* vGIC maintenance IRQ */
+	sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EN, 0);
+
+	/* Pending Fast IPI FIQs */
+	write_sysreg_s(IPI_SR_PENDING, SYS_APL_IPI_SR_EL1);
+
+	/* Timer FIQs */
+	sysreg_clear_set(cntp_ctl_el0, 0, ARCH_TIMER_CTRL_IT_MASK);
+	sysreg_clear_set(cntv_ctl_el0, 0, ARCH_TIMER_CTRL_IT_MASK);
+	sysreg_clear_set_s(SYS_CNTP_CTL_EL02, 0, ARCH_TIMER_CTRL_IT_MASK);
+	sysreg_clear_set_s(SYS_CNTV_CTL_EL02, 0, ARCH_TIMER_CTRL_IT_MASK);
+
+	/* PMC FIQ */
+	sysreg_clear_set_s(SYS_APL_PMCR0_EL1, PMCR0_IMODE | PMCR0_IACT,
+			   FIELD_PREP(PMCR0_IMODE, PMCR0_IMODE_OFF));
+
+	/* Uncore PMC FIQ */
+	sysreg_clear_set_s(SYS_APL_UPMCR0_EL1, UPMCR0_IMODE,
+			   FIELD_PREP(UPMCR0_IMODE, UPMCR0_IMODE_OFF));
+
+	/*
+	 * Make sure the kernel's idea of logical CPU order is the same as AIC's
+	 * If we ever end up with a mismatch here, we will have to introduce
+	 * a mapping table similar to what other irqchip drivers do.
+	 */
+	WARN_ON(aic_ic_read(aic_irqc, AIC_WHOAMI) != smp_processor_id());
+
+	/*
+	 * Always keep IPIs unmasked at the hardware level (except auto-masking
+	 * by AIC during processing). We manage masks at the vIPI level.
+	 */
+	aic_ic_write(aic_irqc, AIC_IPI_ACK, AIC_IPI_SELF | AIC_IPI_OTHER);
+	aic_ic_write(aic_irqc, AIC_IPI_MASK_SET, AIC_IPI_SELF);
+	aic_ic_write(aic_irqc, AIC_IPI_MASK_CLR, AIC_IPI_OTHER);
+
+	return 0;
+
+}
+
+static int __init aic_of_ic_init(struct device_node *node, struct device_node *parent)
+{
+	int i;
+	void __iomem *regs;
+	u32 info;
+	struct aic_irq_chip *irqc;
+
+	regs = of_iomap(node, 0);
+	if (WARN_ON(!regs))
+		return -EIO;
+
+	irqc = kzalloc(sizeof(*irqc), GFP_KERNEL);
+	if (!irqc)
+		return -ENOMEM;
+
+	aic_irqc = irqc;
+	irqc->base = regs;
+
+	info = aic_ic_read(irqc, AIC_INFO);
+	irqc->nr_hw = FIELD_GET(AIC_INFO_NR_HW, info);
+
+	irqc->hw_domain = irq_domain_create_linear(of_node_to_fwnode(node),
+						   irqc->nr_hw + AIC_NR_FIQ,
+						   &aic_irq_domain_ops, irqc);
+	if (WARN_ON(!irqc->hw_domain)) {
+		iounmap(irqc->base);
+		kfree(irqc);
+		return -ENODEV;
+	}
+
+	irq_domain_update_bus_token(irqc->hw_domain, DOMAIN_BUS_WIRED);
+
+	if (aic_init_smp(irqc, node)) {
+		irq_domain_remove(irqc->hw_domain);
+		iounmap(irqc->base);
+		kfree(irqc);
+		return -ENODEV;
+	}
+
+	set_handle_irq(aic_handle_irq);
+	set_handle_fiq(aic_handle_fiq);
+
+	for (i = 0; i < BITS_TO_U32(irqc->nr_hw); i++)
+		aic_ic_write(irqc, AIC_MASK_SET + i * 4, ~0);
+	for (i = 0; i < BITS_TO_U32(irqc->nr_hw); i++)
+		aic_ic_write(irqc, AIC_SW_CLR + i * 4, ~0);
+	for (i = 0; i < irqc->nr_hw; i++)
+		aic_ic_write(irqc, AIC_TARGET_CPU + i * 4, 1);
+
+	cpuhp_setup_state(CPUHP_AP_IRQ_APPLE_AIC_STARTING,
+			  "irqchip/apple-aic/ipi:starting",
+			  aic_init_cpu, NULL);
+
+	pr_info("AIC: initialized with %d IRQs, %d FIQs, %d vIPIs\n",
+		irqc->nr_hw, AIC_NR_FIQ, AIC_NR_SWIPI);
+
+	return 0;
+}
+
+IRQCHIP_DECLARE(apple_m1_aic, "apple,aic", aic_of_ic_init);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index f14adb882338..f56eee992c75 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -100,6 +100,7 @@  enum cpuhp_state {
 	CPUHP_AP_CPU_PM_STARTING,
 	CPUHP_AP_IRQ_GIC_STARTING,
 	CPUHP_AP_IRQ_HIP04_STARTING,
+	CPUHP_AP_IRQ_APPLE_AIC_STARTING,
 	CPUHP_AP_IRQ_ARMADA_XP_STARTING,
 	CPUHP_AP_IRQ_BCM2836_STARTING,
 	CPUHP_AP_IRQ_MIPS_GIC_STARTING,