[v12,3/7] genirq: Add mechanism to multiplex a single HW IPI

Message ID 20221126173453.306088-4-apatel@ventanamicro.com (mailing list archive)
State Superseded
Series RISC-V IPI Improvements

Checks

Context                   Check  Description
conchuod/tree_selection   fail   Guessing tree name failed

Commit Message

Anup Patel Nov. 26, 2022, 5:34 p.m. UTC
All RISC-V platforms have a single HW IPI provided by the INTC local
interrupt controller. The HW method to trigger INTC IPI can be through
external irqchip (e.g. RISC-V AIA), through platform specific device
(e.g. SiFive CLINT timer), or through firmware (e.g. SBI IPI call).

To support multiple IPIs on RISC-V, we add a generic IPI multiplexing
mechanism which helps us create multiple virtual IPIs using a single
HW IPI. This generic IPI multiplexing is inspired by the Apple AIC
irqchip driver and is shared by various RISC-V irqchip drivers.
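
A driver registers its HW IPI trigger callback with ipi_mux_create() and
demultiplexes from its HW IPI handler via ipi_mux_process(). A rough,
illustrative sketch (the mux_hw_ipi_send() callback below is a made-up
name, not part of this series):

	/* Called by the mux whenever the parent (HW) IPI must be triggered */
	static void mux_hw_ipi_send(const struct cpumask *mask)
	{
		/* e.g. SBI IPI call, CLINT MMIO write, or AIA CSR access */
	}

	/* Driver probe: create 8 virtual IPIs on top of the single HW IPI */
	virq = ipi_mux_create(8, mux_hw_ipi_send);

	/* Driver's HW IPI handler: dispatch all pending virtual IPIs */
	ipi_mux_process();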

Signed-off-by: Anup Patel <apatel@ventanamicro.com>
---
 include/linux/irq.h  |   4 +
 kernel/irq/Kconfig   |   5 ++
 kernel/irq/Makefile  |   1 +
 kernel/irq/ipi-mux.c | 210 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 220 insertions(+)
 create mode 100644 kernel/irq/ipi-mux.c

Comments

Marc Zyngier Nov. 28, 2022, 10:34 a.m. UTC | #1
On Sat, 26 Nov 2022 17:34:49 +0000,
Anup Patel <apatel@ventanamicro.com> wrote:
> 
> All RISC-V platforms have a single HW IPI provided by the INTC local
> interrupt controller. The HW method to trigger INTC IPI can be through
> external irqchip (e.g. RISC-V AIA), through platform specific device
> (e.g. SiFive CLINT timer), or through firmware (e.g. SBI IPI call).
> 
> To support multiple IPIs on RISC-V, we add a generic IPI multiplexing
> mechanism which help us create multiple virtual IPIs using a single
> HW IPI. This generic IPI multiplexing is inspired from the Apple AIC
> irqchip driver and it is shared by various RISC-V irqchip drivers.
> 
> Signed-off-by: Anup Patel <apatel@ventanamicro.com>
> ---
>  include/linux/irq.h  |   4 +
>  kernel/irq/Kconfig   |   5 ++
>  kernel/irq/Makefile  |   1 +
>  kernel/irq/ipi-mux.c | 210 +++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 220 insertions(+)
>  create mode 100644 kernel/irq/ipi-mux.c
> 
> diff --git a/include/linux/irq.h b/include/linux/irq.h
> index c3eb89606c2b..6024e1ee1257 100644
> --- a/include/linux/irq.h
> +++ b/include/linux/irq.h
> @@ -1266,6 +1266,10 @@ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest);
>  int ipi_send_single(unsigned int virq, unsigned int cpu);
>  int ipi_send_mask(unsigned int virq, const struct cpumask *dest);
>  
> +void ipi_mux_process(void);
> +int ipi_mux_create(unsigned int nr_ipi,
> +		   void (*mux_send)(const struct cpumask *));
> +
>  #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
>  /*
>   * Registers a generic IRQ handling function as the top-level IRQ handler in
> diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
> index db3d174c53d4..df17dbc54b02 100644
> --- a/kernel/irq/Kconfig
> +++ b/kernel/irq/Kconfig
> @@ -86,6 +86,11 @@ config GENERIC_IRQ_IPI
>  	depends on SMP
>  	select IRQ_DOMAIN_HIERARCHY
>  
> +# Generic IRQ IPI Mux support
> +config GENERIC_IRQ_IPI_MUX
> +	bool
> +	depends on SMP
> +
>  # Generic MSI interrupt support
>  config GENERIC_MSI_IRQ
>  	bool
> diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
> index b4f53717d143..f19d3080bf11 100644
> --- a/kernel/irq/Makefile
> +++ b/kernel/irq/Makefile
> @@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
>  obj-$(CONFIG_PM_SLEEP) += pm.o
>  obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
>  obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
> +obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
>  obj-$(CONFIG_SMP) += affinity.o
>  obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
>  obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
> diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
> new file mode 100644
> index 000000000000..366d8cd5320b
> --- /dev/null
> +++ b/kernel/irq/ipi-mux.c
> @@ -0,0 +1,210 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Multiplex several virtual IPIs over a single HW IPI.
> + *
> + * Copyright The Asahi Linux Contributors
> + * Copyright (c) 2022 Ventana Micro Systems Inc.
> + */
> +
> +#define pr_fmt(fmt) "ipi-mux: " fmt
> +#include <linux/cpu.h>
> +#include <linux/init.h>
> +#include <linux/irq.h>
> +#include <linux/irqchip.h>
> +#include <linux/irqchip/chained_irq.h>
> +#include <linux/irqdomain.h>
> +#include <linux/jump_label.h>
> +#include <linux/percpu.h>
> +#include <linux/smp.h>
> +
> +struct ipi_mux_cpu {
> +	atomic_t			enable;
> +	atomic_t			bits;
> +	struct cpumask			send_mask;
> +};
> +
> +static struct ipi_mux_cpu __percpu *ipi_mux_pcpu;
> +static struct irq_domain *ipi_mux_domain;
> +static void (*ipi_mux_send)(const struct cpumask *mask);
> +
> +static void ipi_mux_mask(struct irq_data *d)
> +{
> +	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> +
> +	atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
> +}
> +
> +static void ipi_mux_unmask(struct irq_data *d)
> +{
> +	u32 ibit = BIT(irqd_to_hwirq(d));
> +	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> +
> +	atomic_or(ibit, &icpu->enable);
> +
> +	/*
> +	 * The atomic_or() above must complete before the atomic_read()
> +	 * below to avoid racing ipi_mux_send_mask().
> +	 */
> +	smp_mb__after_atomic();
> +
> +	/* If a pending IPI was unmasked, raise a parent IPI immediately. */
> +	if (atomic_read(&icpu->bits) & ibit)
> +		ipi_mux_send(cpumask_of(smp_processor_id()));
> +}
> +
> +static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
> +{
> +	u32 ibit = BIT(irqd_to_hwirq(d));
> +	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> +	struct cpumask *send_mask = &icpu->send_mask;
> +	unsigned long flags;
> +	int cpu;
> +
> +	/*
> +	 * We use send_mask as a per-CPU variable so disable local
> +	 * interrupts to avoid being preempted.
> +	 */
> +	local_irq_save(flags);

The correct way to avoid preemption is to use preempt_disable(), which
is a lot cheaper than disabling interrupt on most architectures.
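
Something like this (untested sketch):

	preempt_disable();

	/* ... build send_mask and call ipi_mux_send() as before ... */

	preempt_enable();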

> +
> +	cpumask_clear(send_mask);

This thing is likely to be unnecessarily expensive on very large
systems, as it is proportional to the number of CPUs.

> +
> +	for_each_cpu(cpu, mask) {
> +		icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
> +		atomic_or(ibit, &icpu->bits);

The original code had an atomic_fetch_or_release() to allow eliding
the IPI if the target interrupt was already pending. Why is that code
gone? This is a pretty cheap and efficient optimisation.
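
For reference, the elided-send check looks roughly like this (sketch):

	/* vIPI already pending on this CPU, no need to kick it again */
	if (atomic_fetch_or_release(ibit, &icpu->bits) & ibit)
		continue;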

> +
> +		/*
> +		 * The atomic_or() above must complete before
> +		 * the atomic_read() below to avoid racing with
> +		 * ipi_mux_unmask().
> +		 */
> +		smp_mb__after_atomic();
> +
> +		if (atomic_read(&icpu->enable) & ibit)
> +			cpumask_set_cpu(cpu, send_mask);
> +	}
> +
> +	/* Trigger the parent IPI */
> +	ipi_mux_send(send_mask);

IPIs are very rarely made pending on more than a single CPU at a
time. The overwhelming majority of them are targeting a single CPU. So
accumulating bits to avoid doing two or more "send" actions only
penalises the generic case.

My conclusion is that this "send_mask" can probably be removed,
together with the preemption fiddling.
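
Something along these lines (untested sketch):

	for_each_cpu(cpu, mask) {
		icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
		atomic_or(ibit, &icpu->bits);

		/* pairs with the barrier in ipi_mux_unmask() */
		smp_mb__after_atomic();

		if (atomic_read(&icpu->enable) & ibit)
			ipi_mux_send(cpumask_of(cpu));
	}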

> +
> +	local_irq_restore(flags);
> +}
> +
> +static const struct irq_chip ipi_mux_chip = {
> +	.name		= "IPI Mux",
> +	.irq_mask	= ipi_mux_mask,
> +	.irq_unmask	= ipi_mux_unmask,
> +	.ipi_send_mask	= ipi_mux_send_mask,
> +};

OK, you have now dropped the superfluous pre/post handlers. But the
need still exists. Case in point, the aic_handle_ipi() prologue and
epilogue to the interrupt handling. I have suggested last time that
the driver could provide the actual struct irq_chip in order to
provide the callbacks it requires.

Please realise that I will not take this patch if this cannot be made
to work with the single existing in-tree instance of an IPI MUX. 90%
of the code having been lifted from there, I think this is a pretty
fair ask.

> +
> +static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
> +				unsigned int nr_irqs, void *arg)
> +{
> +	int i;
> +
> +	for (i = 0; i < nr_irqs; i++) {
> +		irq_set_percpu_devid(virq + i);
> +		irq_domain_set_info(d, virq + i, i,
> +				    &ipi_mux_chip, d->host_data,

What does d->host_data represent here?

> +				    handle_percpu_devid_irq, NULL, NULL);
> +	}
> +
> +	return 0;
> +}
> +
> +static const struct irq_domain_ops ipi_mux_domain_ops = {
> +	.alloc		= ipi_mux_domain_alloc,
> +	.free		= irq_domain_free_irqs_top,
> +};
> +
> +/**
> + * ipi_mux_process - Process multiplexed virtual IPIs
> + */
> +void ipi_mux_process(void)
> +{
> +	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> +	irq_hw_number_t hwirq;
> +	unsigned long ipis;
> +	unsigned int en;
> +
> +	/*
> +	 * Reading enable mask does not need to be ordered as long as
> +	 * this function called from interrupt handler because only
> +	 * the CPU itself can change it's own enable mask.
> +	 */
> +	en = atomic_read(&icpu->enable);
> +
> +	/*
> +	 * Clear the IPIs we are about to handle. This pairs with the
> +	 * atomic_fetch_or_release() in ipi_mux_send_mask().
> +	 */
> +	ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
> +
> +	for_each_set_bit(hwirq, &ipis, BITS_PER_LONG)

BITS_PER_LONG...

> +		generic_handle_domain_irq(ipi_mux_domain, hwirq);
> +}
> +
> +/**
> + * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
> + * parent IPI.
> + * @nr_ipi:		number of virtual IPIs to create. This should
> + *			be <= BITS_PER_TYPE(int)
> + * @mux_send:		callback to trigger parent IPI
> + *
> + * Returns first virq of the newly created virtual IPIs upon success
> + * or <=0 upon failure
> + */
> +int ipi_mux_create(unsigned int nr_ipi,
> +		   void (*mux_send)(const struct cpumask *))
> +{
> +	struct fwnode_handle *fwnode;
> +	struct irq_domain *domain;
> +	int rc;
> +
> +	if (ipi_mux_domain)
> +		return -EEXIST;
> +
> +	if (BITS_PER_TYPE(int) < nr_ipi || !mux_send)

... vs BITS_PER_TYPE(int) ...

	M.
Anup Patel Nov. 28, 2022, 11:13 a.m. UTC | #2
On Mon, Nov 28, 2022 at 4:04 PM Marc Zyngier <maz@kernel.org> wrote:
>
> On Sat, 26 Nov 2022 17:34:49 +0000,
> Anup Patel <apatel@ventanamicro.com> wrote:
> >
> > All RISC-V platforms have a single HW IPI provided by the INTC local
> > interrupt controller. The HW method to trigger INTC IPI can be through
> > external irqchip (e.g. RISC-V AIA), through platform specific device
> > (e.g. SiFive CLINT timer), or through firmware (e.g. SBI IPI call).
> >
> > To support multiple IPIs on RISC-V, we add a generic IPI multiplexing
> > mechanism which help us create multiple virtual IPIs using a single
> > HW IPI. This generic IPI multiplexing is inspired from the Apple AIC
> > irqchip driver and it is shared by various RISC-V irqchip drivers.
> >
> > Signed-off-by: Anup Patel <apatel@ventanamicro.com>
> > ---
> >  include/linux/irq.h  |   4 +
> >  kernel/irq/Kconfig   |   5 ++
> >  kernel/irq/Makefile  |   1 +
> >  kernel/irq/ipi-mux.c | 210 +++++++++++++++++++++++++++++++++++++++++++
> >  4 files changed, 220 insertions(+)
> >  create mode 100644 kernel/irq/ipi-mux.c
> >
> > diff --git a/include/linux/irq.h b/include/linux/irq.h
> > index c3eb89606c2b..6024e1ee1257 100644
> > --- a/include/linux/irq.h
> > +++ b/include/linux/irq.h
> > @@ -1266,6 +1266,10 @@ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest);
> >  int ipi_send_single(unsigned int virq, unsigned int cpu);
> >  int ipi_send_mask(unsigned int virq, const struct cpumask *dest);
> >
> > +void ipi_mux_process(void);
> > +int ipi_mux_create(unsigned int nr_ipi,
> > +                void (*mux_send)(const struct cpumask *));
> > +
> >  #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
> >  /*
> >   * Registers a generic IRQ handling function as the top-level IRQ handler in
> > diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
> > index db3d174c53d4..df17dbc54b02 100644
> > --- a/kernel/irq/Kconfig
> > +++ b/kernel/irq/Kconfig
> > @@ -86,6 +86,11 @@ config GENERIC_IRQ_IPI
> >       depends on SMP
> >       select IRQ_DOMAIN_HIERARCHY
> >
> > +# Generic IRQ IPI Mux support
> > +config GENERIC_IRQ_IPI_MUX
> > +     bool
> > +     depends on SMP
> > +
> >  # Generic MSI interrupt support
> >  config GENERIC_MSI_IRQ
> >       bool
> > diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
> > index b4f53717d143..f19d3080bf11 100644
> > --- a/kernel/irq/Makefile
> > +++ b/kernel/irq/Makefile
> > @@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
> >  obj-$(CONFIG_PM_SLEEP) += pm.o
> >  obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
> >  obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
> > +obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
> >  obj-$(CONFIG_SMP) += affinity.o
> >  obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
> >  obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
> > diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
> > new file mode 100644
> > index 000000000000..366d8cd5320b
> > --- /dev/null
> > +++ b/kernel/irq/ipi-mux.c
> > @@ -0,0 +1,210 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +/*
> > + * Multiplex several virtual IPIs over a single HW IPI.
> > + *
> > + * Copyright The Asahi Linux Contributors
> > + * Copyright (c) 2022 Ventana Micro Systems Inc.
> > + */
> > +
> > +#define pr_fmt(fmt) "ipi-mux: " fmt
> > +#include <linux/cpu.h>
> > +#include <linux/init.h>
> > +#include <linux/irq.h>
> > +#include <linux/irqchip.h>
> > +#include <linux/irqchip/chained_irq.h>
> > +#include <linux/irqdomain.h>
> > +#include <linux/jump_label.h>
> > +#include <linux/percpu.h>
> > +#include <linux/smp.h>
> > +
> > +struct ipi_mux_cpu {
> > +     atomic_t                        enable;
> > +     atomic_t                        bits;
> > +     struct cpumask                  send_mask;
> > +};
> > +
> > +static struct ipi_mux_cpu __percpu *ipi_mux_pcpu;
> > +static struct irq_domain *ipi_mux_domain;
> > +static void (*ipi_mux_send)(const struct cpumask *mask);
> > +
> > +static void ipi_mux_mask(struct irq_data *d)
> > +{
> > +     struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> > +
> > +     atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
> > +}
> > +
> > +static void ipi_mux_unmask(struct irq_data *d)
> > +{
> > +     u32 ibit = BIT(irqd_to_hwirq(d));
> > +     struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> > +
> > +     atomic_or(ibit, &icpu->enable);
> > +
> > +     /*
> > +      * The atomic_or() above must complete before the atomic_read()
> > +      * below to avoid racing ipi_mux_send_mask().
> > +      */
> > +     smp_mb__after_atomic();
> > +
> > +     /* If a pending IPI was unmasked, raise a parent IPI immediately. */
> > +     if (atomic_read(&icpu->bits) & ibit)
> > +             ipi_mux_send(cpumask_of(smp_processor_id()));
> > +}
> > +
> > +static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
> > +{
> > +     u32 ibit = BIT(irqd_to_hwirq(d));
> > +     struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> > +     struct cpumask *send_mask = &icpu->send_mask;
> > +     unsigned long flags;
> > +     int cpu;
> > +
> > +     /*
> > +      * We use send_mask as a per-CPU variable so disable local
> > +      * interrupts to avoid being preempted.
> > +      */
> > +     local_irq_save(flags);
>
> The correct way to avoid preemption is to use preempt_disable(), which
> is a lot cheaper than disabling interrupt on most architectures.

Okay, I will update.

>
> > +
> > +     cpumask_clear(send_mask);
>
> This thing is likely to be unnecessarily expensive on very large
> systems, as it is proportional to the number of CPUs.
>
> > +
> > +     for_each_cpu(cpu, mask) {
> > +             icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
> > +             atomic_or(ibit, &icpu->bits);
>
> The original code had an atomic_fetch_or_release() to allow eliding
> the IPI if the target interrupt was already pending. Why is that code
> gone? This is a pretty cheap and efficient optimisation.

That optimization is causing RCU stalls on the QEMU RISC-V virt
machine with a large number of CPUs.

>
> > +
> > +             /*
> > +              * The atomic_or() above must complete before
> > +              * the atomic_read() below to avoid racing with
> > +              * ipi_mux_unmask().
> > +              */
> > +             smp_mb__after_atomic();
> > +
> > +             if (atomic_read(&icpu->enable) & ibit)
> > +                     cpumask_set_cpu(cpu, send_mask);
> > +     }
> > +
> > +     /* Trigger the parent IPI */
> > +     ipi_mux_send(send_mask);
>
> IPIs are very rarely made pending on more than a single CPU at a
> time. The overwhelming majority of them are targeting a single CPU. So
> accumulating bits to avoid doing two or more "send" actions only
> penalises the generic case.
>
> My conclusion is that this "send_mask" can probably be removed,
> together with the preemption fiddling.

So, we should call ipi_mux_send() for one target CPU at a time?

>
> > +
> > +     local_irq_restore(flags);
> > +}
> > +
> > +static const struct irq_chip ipi_mux_chip = {
> > +     .name           = "IPI Mux",
> > +     .irq_mask       = ipi_mux_mask,
> > +     .irq_unmask     = ipi_mux_unmask,
> > +     .ipi_send_mask  = ipi_mux_send_mask,
> > +};
>
> OK, you have now dropped the superfluous pre/post handlers. But the
> need still exists. Case in point, the aic_handle_ipi() prologue and
> epilogue to the interrupt handling. I have suggested last time that
> the driver could provide the actual struct irq_chip in order to
> provide the callbacks it requires.

The aic_handle_ipi() can simply call ipi_mux_process() between
the prologue and epilogue.
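
Roughly (sketch only; the prologue/epilogue helper names below are made
up, and the actual AIC register accesses stay exactly as they are today):

	/* inside aic_handle_ipi() */
	aic_ipi_ack();		/* prologue: ack/mask the HW IPI */
	ipi_mux_process();	/* dispatch all pending virtual IPIs */
	aic_ipi_unmask();	/* epilogue: re-enable the HW IPI */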

>
> Please realise that I will not take this patch if this cannot be made
> to work with the single existing in-tree instance of an IPI MUX. 90%
> of the code having been lifted from there, I think this is a pretty
> fair ask.

Only the muxing part of AIC has been factored out. All the register
programming of AIC will remain in the AIC irqchip driver without any
change in sequence.

>
> > +
> > +static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
> > +                             unsigned int nr_irqs, void *arg)
> > +{
> > +     int i;
> > +
> > +     for (i = 0; i < nr_irqs; i++) {
> > +             irq_set_percpu_devid(virq + i);
> > +             irq_domain_set_info(d, virq + i, i,
> > +                                 &ipi_mux_chip, d->host_data,
>
> What does d->host_data represent here?

It's always NULL so we don't need to pass it. I will update.

>
> > +                                 handle_percpu_devid_irq, NULL, NULL);
> > +     }
> > +
> > +     return 0;
> > +}
> > +
> > +static const struct irq_domain_ops ipi_mux_domain_ops = {
> > +     .alloc          = ipi_mux_domain_alloc,
> > +     .free           = irq_domain_free_irqs_top,
> > +};
> > +
> > +/**
> > + * ipi_mux_process - Process multiplexed virtual IPIs
> > + */
> > +void ipi_mux_process(void)
> > +{
> > +     struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> > +     irq_hw_number_t hwirq;
> > +     unsigned long ipis;
> > +     unsigned int en;
> > +
> > +     /*
> > +      * Reading enable mask does not need to be ordered as long as
> > +      * this function called from interrupt handler because only
> > +      * the CPU itself can change it's own enable mask.
> > +      */
> > +     en = atomic_read(&icpu->enable);
> > +
> > +     /*
> > +      * Clear the IPIs we are about to handle. This pairs with the
> > +      * atomic_fetch_or_release() in ipi_mux_send_mask().
> > +      */
> > +     ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
> > +
> > +     for_each_set_bit(hwirq, &ipis, BITS_PER_LONG)
>
> BITS_PER_LONG...

Argh, I should have used BITS_PER_TYPE(int) here. I will update.

>
> > +             generic_handle_domain_irq(ipi_mux_domain, hwirq);
> > +}
> > +
> > +/**
> > + * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
> > + * parent IPI.
> > + * @nr_ipi:          number of virtual IPIs to create. This should
> > + *                   be <= BITS_PER_TYPE(int)
> > + * @mux_send:                callback to trigger parent IPI
> > + *
> > + * Returns first virq of the newly created virtual IPIs upon success
> > + * or <=0 upon failure
> > + */
> > +int ipi_mux_create(unsigned int nr_ipi,
> > +                void (*mux_send)(const struct cpumask *))
> > +{
> > +     struct fwnode_handle *fwnode;
> > +     struct irq_domain *domain;
> > +     int rc;
> > +
> > +     if (ipi_mux_domain)
> > +             return -EEXIST;
> > +
> > +     if (BITS_PER_TYPE(int) < nr_ipi || !mux_send)
>
> ... vs BITS_PER_TYPE(int) ...
>
>         M.
>
> --
> Without deviation from the norm, progress is not possible.

Regards,
Anup
Marc Zyngier Nov. 28, 2022, 11:30 a.m. UTC | #3
On Mon, 28 Nov 2022 11:13:30 +0000,
Anup Patel <apatel@ventanamicro.com> wrote:
> 
> On Mon, Nov 28, 2022 at 4:04 PM Marc Zyngier <maz@kernel.org> wrote:
> >
> > On Sat, 26 Nov 2022 17:34:49 +0000,
> > Anup Patel <apatel@ventanamicro.com> wrote:
> > >
> > > +static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
> > > +{
> > > +     u32 ibit = BIT(irqd_to_hwirq(d));
> > > +     struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> > > +     struct cpumask *send_mask = &icpu->send_mask;
> > > +     unsigned long flags;
> > > +     int cpu;
> > > +
> > > +     /*
> > > +      * We use send_mask as a per-CPU variable so disable local
> > > +      * interrupts to avoid being preempted.
> > > +      */
> > > +     local_irq_save(flags);
> >
> > The correct way to avoid preemption is to use preempt_disable(), which
> > is a lot cheaper than disabling interrupt on most architectures.
> 
> Okay, I will update.
> 
> >
> > > +
> > > +     cpumask_clear(send_mask);
> >
> > This thing is likely to be unnecessarily expensive on very large
> > systems, as it is proportional to the number of CPUs.
> >
> > > +
> > > +     for_each_cpu(cpu, mask) {
> > > +             icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
> > > +             atomic_or(ibit, &icpu->bits);
> >
> > The original code had an atomic_fetch_or_release() to allow eliding
> > the IPI if the target interrupt was already pending. Why is that code
> > gone? This is a pretty cheap and efficient optimisation.
> 
> That optimization is causing RCU stalls on QEMU RISC-V virt
> machine with large number of CPUs.

Then there is a bug somewhere, either in the implementation of the
atomic operations or in QEMU. Or maybe even in the original code
(though this looks unlikely given how heavily this is used on actual
HW - I'm typing this email from one of these machines, and I'd be
pretty annoyed if I was missing IPIs).

In any case, please don't paper over this.

> 
> >
> > > +
> > > +             /*
> > > +              * The atomic_or() above must complete before
> > > +              * the atomic_read() below to avoid racing with
> > > +              * ipi_mux_unmask().
> > > +              */
> > > +             smp_mb__after_atomic();
> > > +
> > > +             if (atomic_read(&icpu->enable) & ibit)
> > > +                     cpumask_set_cpu(cpu, send_mask);
> > > +     }
> > > +
> > > +     /* Trigger the parent IPI */
> > > +     ipi_mux_send(send_mask);
> >
> > IPIs are very rarely made pending on more than a single CPU at a
> > time. The overwhelming majority of them are targeting a single CPU. So
> > accumulating bits to avoid doing two or more "send" actions only
> > penalises the generic case.
> >
> > My conclusion is that this "send_mask" can probably be removed,
> > together with the preemption fiddling.
> 
> So, we should call ipi_mux_send() for one target CPU at a time ?

I think so, as it matches my measurements from a few years ago. It
also simplifies things significantly, leading to better performance
for the common case. Add some instrumentation and see whether this is
still the case though.

> 
> >
> > > +
> > > +     local_irq_restore(flags);
> > > +}
> > > +
> > > +static const struct irq_chip ipi_mux_chip = {
> > > +     .name           = "IPI Mux",
> > > +     .irq_mask       = ipi_mux_mask,
> > > +     .irq_unmask     = ipi_mux_unmask,
> > > +     .ipi_send_mask  = ipi_mux_send_mask,
> > > +};
> >
> > OK, you have now dropped the superfluous pre/post handlers. But the
> > need still exists. Case in point, the aic_handle_ipi() prologue and
> > epilogue to the interrupt handling. I have suggested last time that
> > the driver could provide the actual struct irq_chip in order to
> > provide the callbacks it requires.
> 
> The aic_handle_ipi() can simply call ipi_mux_process() between
> the prologue and epilogue.

Hmm. OK. That's not what I had in mind, but fair enough.

	M.
Anup Patel Nov. 29, 2022, 2:15 p.m. UTC | #4
On Mon, Nov 28, 2022 at 5:00 PM Marc Zyngier <maz@kernel.org> wrote:
>
> On Mon, 28 Nov 2022 11:13:30 +0000,
> Anup Patel <apatel@ventanamicro.com> wrote:
> >
> > On Mon, Nov 28, 2022 at 4:04 PM Marc Zyngier <maz@kernel.org> wrote:
> > >
> > > On Sat, 26 Nov 2022 17:34:49 +0000,
> > > Anup Patel <apatel@ventanamicro.com> wrote:
> > > >
> > > > +static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
> > > > +{
> > > > +     u32 ibit = BIT(irqd_to_hwirq(d));
> > > > +     struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> > > > +     struct cpumask *send_mask = &icpu->send_mask;
> > > > +     unsigned long flags;
> > > > +     int cpu;
> > > > +
> > > > +     /*
> > > > +      * We use send_mask as a per-CPU variable so disable local
> > > > +      * interrupts to avoid being preempted.
> > > > +      */
> > > > +     local_irq_save(flags);
> > >
> > > The correct way to avoid preemption is to use preempt_disable(), which
> > > is a lot cheaper than disabling interrupt on most architectures.
> >
> > Okay, I will update.
> >
> > >
> > > > +
> > > > +     cpumask_clear(send_mask);
> > >
> > > This thing is likely to be unnecessarily expensive on very large
> > > systems, as it is proportional to the number of CPUs.
> > >
> > > > +
> > > > +     for_each_cpu(cpu, mask) {
> > > > +             icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
> > > > +             atomic_or(ibit, &icpu->bits);
> > >
> > > The original code had an atomic_fetch_or_release() to allow eliding
> > > the IPI if the target interrupt was already pending. Why is that code
> > > gone? This is a pretty cheap and efficient optimisation.
> >
> > That optimization is causing RCU stalls on QEMU RISC-V virt
> > machine with large number of CPUs.
>
> Then there is a bug somewhere, either in the implementation of the
> atomic operations or in QEMU. Or maybe even in the original code
> (though this looks unlikely given how heavily this is used on actual
> HW - I'm typing this email from one of these machines, and I'd be
> pretty annoyed if I was missing IPIs).
>
> In any case, please don't paper over this.

I was trying to defer the optimization to a later stage until this
issue was fixed for RISC-V.

Anyway, I found the root cause. This turned out to be a missing
broadcast timer initialization in time_init() for RISC-V. Removing
the optimization here was simply hiding the issue.
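
For context, the kind of fix being referred to is along these lines
(illustrative sketch only, not part of this series):

	/* arch/riscv/kernel/time.c */
	void __init time_init(void)
	{
		/* existing clocksource/clockevent setup ... */
		tick_setup_hrtimer_broadcast();
	}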

I will bring back the optimization in the next patch revision.

>
> >
> > >
> > > > +
> > > > +             /*
> > > > +              * The atomic_or() above must complete before
> > > > +              * the atomic_read() below to avoid racing with
> > > > +              * ipi_mux_unmask().
> > > > +              */
> > > > +             smp_mb__after_atomic();
> > > > +
> > > > +             if (atomic_read(&icpu->enable) & ibit)
> > > > +                     cpumask_set_cpu(cpu, send_mask);
> > > > +     }
> > > > +
> > > > +     /* Trigger the parent IPI */
> > > > +     ipi_mux_send(send_mask);
> > >
> > > IPIs are very rarely made pending on more than a single CPU at a
> > > time. The overwhelming majority of them are targeting a single CPU. So
> > > accumulating bits to avoid doing two or more "send" actions only
> > > penalises the generic case.
> > >
> > > My conclusion is that this "send_mask" can probably be removed,
> > > together with the preemption fiddling.
> >
> > So, we should call ipi_mux_send() for one target CPU at a time ?
>
> I think so, as it matches my measurements from a few years ago. It
> also simplifies things significantly, leading to better performance
> for the common case. Add some instrumentation and see whether this is
> still the case though.

I did not see any difference in hackbench results running on QEMU RISC-V,
so I will simplify ipi_mux_send() as you suggested.
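
Roughly, the simplified path could then look like this (untested sketch,
with the pending-bit elision restored):

	static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
	{
		u32 ibit = BIT(irqd_to_hwirq(d));
		struct ipi_mux_cpu *icpu;
		unsigned int cpu;

		for_each_cpu(cpu, mask) {
			icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);

			/* Skip the parent IPI if this vIPI was already pending */
			if (atomic_fetch_or_release(ibit, &icpu->bits) & ibit)
				continue;

			/* Order the bit set above against the enable read below */
			smp_mb__after_atomic();

			if (atomic_read(&icpu->enable) & ibit)
				ipi_mux_send(cpumask_of(cpu));
		}
	}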

>
> >
> > >
> > > > +
> > > > +     local_irq_restore(flags);
> > > > +}
> > > > +
> > > > +static const struct irq_chip ipi_mux_chip = {
> > > > +     .name           = "IPI Mux",
> > > > +     .irq_mask       = ipi_mux_mask,
> > > > +     .irq_unmask     = ipi_mux_unmask,
> > > > +     .ipi_send_mask  = ipi_mux_send_mask,
> > > > +};
> > >
> > > OK, you have now dropped the superfluous pre/post handlers. But the
> > > need still exists. Case in point, the aic_handle_ipi() prologue and
> > > epilogue to the interrupt handling. I have suggested last time that
> > > the driver could provide the actual struct irq_chip in order to
> > > provide the callbacks it requires.
> >
> > The aic_handle_ipi() can simply call ipi_mux_process() between
> > the prologue and epilogue.
>
> Hmm. OK. That's not what I had in mind, but fair enough.
>
>         M.
>
> --
> Without deviation from the norm, progress is not possible.

Regards,
Anup

Patch

diff --git a/include/linux/irq.h b/include/linux/irq.h
index c3eb89606c2b..6024e1ee1257 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -1266,6 +1266,10 @@  int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest);
 int ipi_send_single(unsigned int virq, unsigned int cpu);
 int ipi_send_mask(unsigned int virq, const struct cpumask *dest);
 
+void ipi_mux_process(void);
+int ipi_mux_create(unsigned int nr_ipi,
+		   void (*mux_send)(const struct cpumask *));
+
 #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
 /*
  * Registers a generic IRQ handling function as the top-level IRQ handler in
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index db3d174c53d4..df17dbc54b02 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -86,6 +86,11 @@  config GENERIC_IRQ_IPI
 	depends on SMP
 	select IRQ_DOMAIN_HIERARCHY
 
+# Generic IRQ IPI Mux support
+config GENERIC_IRQ_IPI_MUX
+	bool
+	depends on SMP
+
 # Generic MSI interrupt support
 config GENERIC_MSI_IRQ
 	bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index b4f53717d143..f19d3080bf11 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -15,6 +15,7 @@  obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
 obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
 obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
+obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
 obj-$(CONFIG_SMP) += affinity.o
 obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
 obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
new file mode 100644
index 000000000000..366d8cd5320b
--- /dev/null
+++ b/kernel/irq/ipi-mux.c
@@ -0,0 +1,210 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Multiplex several virtual IPIs over a single HW IPI.
+ *
+ * Copyright The Asahi Linux Contributors
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "ipi-mux: " fmt
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+struct ipi_mux_cpu {
+	atomic_t			enable;
+	atomic_t			bits;
+	struct cpumask			send_mask;
+};
+
+static struct ipi_mux_cpu __percpu *ipi_mux_pcpu;
+static struct irq_domain *ipi_mux_domain;
+static void (*ipi_mux_send)(const struct cpumask *mask);
+
+static void ipi_mux_mask(struct irq_data *d)
+{
+	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+
+	atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
+}
+
+static void ipi_mux_unmask(struct irq_data *d)
+{
+	u32 ibit = BIT(irqd_to_hwirq(d));
+	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+
+	atomic_or(ibit, &icpu->enable);
+
+	/*
+	 * The atomic_or() above must complete before the atomic_read()
+	 * below to avoid racing ipi_mux_send_mask().
+	 */
+	smp_mb__after_atomic();
+
+	/* If a pending IPI was unmasked, raise a parent IPI immediately. */
+	if (atomic_read(&icpu->bits) & ibit)
+		ipi_mux_send(cpumask_of(smp_processor_id()));
+}
+
+static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
+{
+	u32 ibit = BIT(irqd_to_hwirq(d));
+	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+	struct cpumask *send_mask = &icpu->send_mask;
+	unsigned long flags;
+	int cpu;
+
+	/*
+	 * We use send_mask as a per-CPU variable so disable local
+	 * interrupts to avoid being preempted.
+	 */
+	local_irq_save(flags);
+
+	cpumask_clear(send_mask);
+
+	for_each_cpu(cpu, mask) {
+		icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
+		atomic_or(ibit, &icpu->bits);
+
+		/*
+		 * The atomic_or() above must complete before
+		 * the atomic_read() below to avoid racing with
+		 * ipi_mux_unmask().
+		 */
+		smp_mb__after_atomic();
+
+		if (atomic_read(&icpu->enable) & ibit)
+			cpumask_set_cpu(cpu, send_mask);
+	}
+
+	/* Trigger the parent IPI */
+	ipi_mux_send(send_mask);
+
+	local_irq_restore(flags);
+}
+
+static const struct irq_chip ipi_mux_chip = {
+	.name		= "IPI Mux",
+	.irq_mask	= ipi_mux_mask,
+	.irq_unmask	= ipi_mux_unmask,
+	.ipi_send_mask	= ipi_mux_send_mask,
+};
+
+static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
+				unsigned int nr_irqs, void *arg)
+{
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_set_percpu_devid(virq + i);
+		irq_domain_set_info(d, virq + i, i,
+				    &ipi_mux_chip, d->host_data,
+				    handle_percpu_devid_irq, NULL, NULL);
+	}
+
+	return 0;
+}
+
+static const struct irq_domain_ops ipi_mux_domain_ops = {
+	.alloc		= ipi_mux_domain_alloc,
+	.free		= irq_domain_free_irqs_top,
+};
+
+/**
+ * ipi_mux_process - Process multiplexed virtual IPIs
+ */
+void ipi_mux_process(void)
+{
+	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+	irq_hw_number_t hwirq;
+	unsigned long ipis;
+	unsigned int en;
+
+	/*
+	 * Reading the enable mask does not need to be ordered as long as
+	 * this function is called from the interrupt handler, because only
+	 * the CPU itself can change its own enable mask.
+	 */
+	en = atomic_read(&icpu->enable);
+
+	/*
+	 * Clear the IPIs we are about to handle. This pairs with the
+	 * atomic_fetch_or_release() in ipi_mux_send_mask().
+	 */
+	ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
+
+	for_each_set_bit(hwirq, &ipis, BITS_PER_LONG)
+		generic_handle_domain_irq(ipi_mux_domain, hwirq);
+}
+
+/**
+ * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
+ * parent IPI.
+ * @nr_ipi:		number of virtual IPIs to create. This should
+ *			be <= BITS_PER_TYPE(int)
+ * @mux_send:		callback to trigger parent IPI
+ *
+ * Returns first virq of the newly created virtual IPIs upon success
+ * or <=0 upon failure
+ */
+int ipi_mux_create(unsigned int nr_ipi,
+		   void (*mux_send)(const struct cpumask *))
+{
+	struct fwnode_handle *fwnode;
+	struct irq_domain *domain;
+	int rc;
+
+	if (ipi_mux_domain)
+		return -EEXIST;
+
+	if (BITS_PER_TYPE(int) < nr_ipi || !mux_send)
+		return -EINVAL;
+
+	ipi_mux_pcpu = alloc_percpu(typeof(*ipi_mux_pcpu));
+	if (!ipi_mux_pcpu)
+		return -ENOMEM;
+
+	fwnode = irq_domain_alloc_named_fwnode("IPI-Mux");
+	if (!fwnode) {
+		pr_err("unable to create IPI Mux fwnode\n");
+		rc = -ENOMEM;
+		goto fail_free_cpu;
+	}
+
+	domain = irq_domain_create_simple(fwnode, nr_ipi, 0,
+					  &ipi_mux_domain_ops, NULL);
+	if (!domain) {
+		pr_err("unable to add IPI Mux domain\n");
+		rc = -ENOMEM;
+		goto fail_free_fwnode;
+	}
+
+	domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE;
+	irq_domain_update_bus_token(domain, DOMAIN_BUS_IPI);
+
+	rc = __irq_domain_alloc_irqs(domain, -1, nr_ipi,
+				     NUMA_NO_NODE, NULL, false, NULL);
+	if (rc <= 0) {
+		pr_err("unable to alloc IRQs from IPI Mux domain\n");
+		goto fail_free_domain;
+	}
+
+	ipi_mux_domain = domain;
+	ipi_mux_send = mux_send;
+
+	return rc;
+
+fail_free_domain:
+	irq_domain_remove(domain);
+fail_free_fwnode:
+	irq_domain_free_fwnode(fwnode);
+fail_free_cpu:
+	free_percpu(ipi_mux_pcpu);
+	return rc;
+}