diff mbox

[v3,-tip,1/5] x86, MSI: Support multiple MSIs in presense of IRQ remapping

Message ID b15a9b5242c7de5770d2315ea8d13f4dcf63c00b.1349074231.git.agordeev@redhat.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Alexander Gordeev Oct. 1, 2012, 8:09 a.m. UTC
The MSI specification has several constraints in comparison with MSI-X,
most notable of them is the inability to configure MSIs independently.
As a result, it is impossible to dispatch interrupts from different
queues to different CPUs. This is largely devalues the support of
multiple MSIs in SMP systems.

Also, a necessity to allocate a contiguous block of vector numbers for
devices capable of multiple MSIs might cause a considerable pressure on
x86 interrupt vector allocator and could lead to fragmentation of the
interrupt vectors space.

This patch overcomes both drawbacks in presense of IRQ remapping and
lets devices take advantage of multiple queues and per-IRQ affinity
assignments.

Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
---
 arch/x86/kernel/apic/io_apic.c |  174 +++++++++++++++++++++++++++++++++------
 include/linux/irq.h            |    6 ++
 kernel/irq/chip.c              |   30 +++++--
 kernel/irq/irqdesc.c           |   31 +++++++
 4 files changed, 206 insertions(+), 35 deletions(-)

Comments

Ingo Molnar Oct. 2, 2012, 4:55 a.m. UTC | #1
* Alexander Gordeev <agordeev@redhat.com> wrote:

> The MSI specification has several constraints in comparison with MSI-X,
> most notable of them is the inability to configure MSIs independently.
> As a result, it is impossible to dispatch interrupts from different
> queues to different CPUs. This is largely devalues the support of
> multiple MSIs in SMP systems.
> 
> Also, a necessity to allocate a contiguous block of vector numbers for
> devices capable of multiple MSIs might cause a considerable pressure on
> x86 interrupt vector allocator and could lead to fragmentation of the
> interrupt vectors space.
> 
> This patch overcomes both drawbacks in presense of IRQ remapping and
> lets devices take advantage of multiple queues and per-IRQ affinity
> assignments.
> 
> Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
> ---
>  arch/x86/kernel/apic/io_apic.c |  174 +++++++++++++++++++++++++++++++++------
>  include/linux/irq.h            |    6 ++
>  kernel/irq/chip.c              |   30 +++++--
>  kernel/irq/irqdesc.c           |   31 +++++++
>  4 files changed, 206 insertions(+), 35 deletions(-)
> 
> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> index c265593..d5cb13c 100644
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -305,6 +305,11 @@ static int alloc_irq_from(unsigned int from, int node)
>  	return irq_alloc_desc_from(from, node);
>  }
>  
> +static int alloc_irqs_from(unsigned int from, unsigned int count, int node)
> +{
> +	return irq_alloc_descs_from(from, count, node);
> +}
> +
>  static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
>  {
>  	free_irq_cfg(at, cfg);
> @@ -2991,37 +2996,58 @@ device_initcall(ioapic_init_ops);
>  /*
>   * Dynamic irq allocate and deallocation
>   */
> -unsigned int create_irq_nr(unsigned int from, int node)
> +unsigned int __create_irqs(unsigned int from, unsigned int count, int node)
>  {
> -	struct irq_cfg *cfg;
> +	struct irq_cfg **cfg;
>  	unsigned long flags;
> -	unsigned int ret = 0;
> -	int irq;
> +	int irq, i;
>  
>  	if (from < nr_irqs_gsi)
>  		from = nr_irqs_gsi;
>  
> -	irq = alloc_irq_from(from, node);
> -	if (irq < 0)
> -		return 0;
> -	cfg = alloc_irq_cfg(irq, node);
> -	if (!cfg) {
> -		free_irq_at(irq, NULL);
> +	cfg = kzalloc_node(count * sizeof(cfg[0]), GFP_KERNEL, node);
> +	if (!cfg)
>  		return 0;
> +
> +	irq = alloc_irqs_from(from, count, node);
> +	if (irq < 0)
> +		goto out_cfgs;
> +
> +	for (i = 0; i < count; i++) {
> +		cfg[i] = alloc_irq_cfg(irq + i, node);
> +		if (!cfg[i])
> +			goto out_irqs;
>  	}
>  
>  	raw_spin_lock_irqsave(&vector_lock, flags);
> -	if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
> -		ret = irq;
> +	for (i = 0; i < count; i++)
> +		if (__assign_irq_vector(irq + i, cfg[i], apic->target_cpus()))
> +			goto out_vecs;
>  	raw_spin_unlock_irqrestore(&vector_lock, flags);
>  
> -	if (ret) {
> -		irq_set_chip_data(irq, cfg);
> -		irq_clear_status_flags(irq, IRQ_NOREQUEST);
> -	} else {
> -		free_irq_at(irq, cfg);
> +	for (i = 0; i < count; i++) {
> +		irq_set_chip_data(irq + i, cfg[i]);
> +		irq_clear_status_flags(irq + i, IRQ_NOREQUEST);
>  	}
> -	return ret;
> +
> +	kfree(cfg);
> +	return irq;
> +
> +out_vecs:
> +	for (; i; i--)
> +		__clear_irq_vector(irq + i - 1, cfg[i - 1]);
> +	raw_spin_unlock_irqrestore(&vector_lock, flags);
> +out_irqs:
> +	for (i = 0; i < count; i++)
> +		free_irq_at(irq + i, cfg[i]);
> +out_cfgs:
> +	kfree(cfg);
> +	return 0;
> +}
> +
> +unsigned int create_irq_nr(unsigned int from, int node)
> +{
> +	return __create_irqs(from, 1, node);
>  }
>  
>  int create_irq(void)
> @@ -3054,6 +3080,27 @@ void destroy_irq(unsigned int irq)
>  	free_irq_at(irq, cfg);
>  }
>  
> +static inline void destroy_irqs(unsigned int irq, unsigned int count)
> +{
> +	unsigned int i;
> +	for (i = 0; i < count; i++)

Missing newline.

> +		destroy_irq(irq + i);
> +}
> +
> +static inline int
> +can_create_pow_of_two_irqs(unsigned int from, unsigned int count)
> +{
> +	if ((count > 1) && (count % 2))
> +		return -EINVAL;
> +
> +	for (; count; count = count / 2) {
> +		if (!irq_can_alloc_irqs(from, count))
> +			return count;
> +	}
> +
> +	return -ENOSPC;
> +}
> +
>  /*
>   * MSI message composition
>   */
> @@ -3145,18 +3192,25 @@ static struct irq_chip msi_chip = {
>  	.irq_retrigger		= ioapic_retrigger_irq,
>  };
>  
> -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
> +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
> +			 unsigned int irq_base, unsigned int irq_offset)
>  {
>  	struct irq_chip *chip = &msi_chip;
>  	struct msi_msg msg;
> +	unsigned int irq = irq_base + irq_offset;
>  	int ret;
>  
>  	ret = msi_compose_msg(dev, irq, &msg, -1);
>  	if (ret < 0)
>  		return ret;
>  
> -	irq_set_msi_desc(irq, msidesc);
> -	write_msi_msg(irq, &msg);
> +	irq_set_msi_desc_off(irq_base, irq_offset, msidesc);
> +
> +	/* MSI-X message is written per-IRQ, the offset is always 0.
> +	 * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
> +	 */

Please use the customary (multi-line) comment style:

  /*
   * Comment .....
   * ...... goes here.
   */

specified in Documentation/CodingStyle.


> +	if (!irq_offset)
> +		write_msi_msg(irq, &msg);
>  
>  	if (irq_remapped(irq_get_chip_data(irq))) {
>  		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
> @@ -3170,16 +3224,12 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
>  	return 0;
>  }
>  
> -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> +int setup_msix_irqs(struct pci_dev *dev, int nvec)
>  {
>  	int node, ret, sub_handle, index = 0;
>  	unsigned int irq, irq_want;
>  	struct msi_desc *msidesc;
>  
> -	/* x86 doesn't support multiple MSI yet */
> -	if (type == PCI_CAP_ID_MSI && nvec > 1)
> -		return 1;
> -
>  	node = dev_to_node(&dev->dev);
>  	irq_want = nr_irqs_gsi;
>  	sub_handle = 0;
> @@ -3208,7 +3258,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
>  				goto error;
>  		}
>  no_ir:
> -		ret = setup_msi_irq(dev, msidesc, irq);
> +		ret = setup_msi_irq(dev, msidesc, irq, 0);
>  		if (ret < 0)
>  			goto error;
>  		sub_handle++;
> @@ -3220,6 +3270,76 @@ error:
>  	return ret;
>  }
>  
> +int setup_msi_irqs(struct pci_dev *dev, int nvec)
> +{
> +	int node, ret, sub_handle, index = 0;
> +	unsigned int irq;
> +	struct msi_desc *msidesc;
> +
> +	if (nvec > 1 && !irq_remapping_enabled)
> +		return 1;
> +
> +	nvec = __roundup_pow_of_two(nvec);
> +	ret = can_create_pow_of_two_irqs(nr_irqs_gsi, nvec);
> +	if (ret != nvec)
> +		return ret;
> +
> +	WARN_ON(!list_is_singular(&dev->msi_list));
> +	msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
> +	WARN_ON(msidesc->irq);
> +	WARN_ON(msidesc->msi_attrib.multiple);
> +
> +	node = dev_to_node(&dev->dev);
> +	irq = __create_irqs(nr_irqs_gsi, nvec, node);
> +	if (irq == 0)
> +		return -ENOSPC;
> +
> +	if (!irq_remapping_enabled) {
> +		ret = setup_msi_irq(dev, msidesc, irq, 0);
> +		if (ret < 0)
> +			goto error;
> +		return 0;
> +	}
> +
> +	msidesc->msi_attrib.multiple = ilog2(nvec);
> +	for (sub_handle = 0; sub_handle < nvec; sub_handle++) {
> +		if (!sub_handle) {
> +			index = msi_alloc_remapped_irq(dev, irq, nvec);
> +			if (index < 0) {
> +				ret = index;
> +				goto error;
> +			}
> +		} else {
> +			ret = msi_setup_remapped_irq(dev, irq + sub_handle,
> +						     index, sub_handle);
> +			if (ret < 0)
> +				goto error;
> +		}
> +		ret = setup_msi_irq(dev, msidesc, irq, sub_handle);
> +		if (ret < 0)
> +			goto error;
> +	}
> +	return 0;
> +
> +error:
> +	destroy_irqs(irq, nvec);
> +
> +	/* Restore altered MSI descriptor fields and prevent just destroyed
> +	 * IRQs from tearing down again in default_teardown_msi_irqs()
> +	 */

Ditto.

> +	msidesc->irq = 0;
> +	msidesc->msi_attrib.multiple = 0;
> +
> +	return ret;
> +}
> +
> +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> +{
> +	if (type == PCI_CAP_ID_MSI)
> +		return setup_msi_irqs(dev, nvec);
> +	return setup_msix_irqs(dev, nvec);
> +}
> +
>  void native_teardown_msi_irq(unsigned int irq)
>  {
>  	destroy_irq(irq);
> diff --git a/include/linux/irq.h b/include/linux/irq.h
> index 216b0ba..c3ba39f 100644
> --- a/include/linux/irq.h
> +++ b/include/linux/irq.h
> @@ -522,6 +522,8 @@ extern int irq_set_handler_data(unsigned int irq, void *data);
>  extern int irq_set_chip_data(unsigned int irq, void *data);
>  extern int irq_set_irq_type(unsigned int irq, unsigned int type);
>  extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry);
> +extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
> +				struct msi_desc *entry);
>  extern struct irq_data *irq_get_irq_data(unsigned int irq);
>  
>  static inline struct irq_chip *irq_get_chip(unsigned int irq)
> @@ -584,8 +586,12 @@ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
>  #define irq_alloc_desc_from(from, node)		\
>  	irq_alloc_descs(-1, from, 1, node)
>  
> +#define irq_alloc_descs_from(from, cnt, node)	\
> +	irq_alloc_descs(-1, from, cnt, node)
> +

Please use inlines instead of macros. Might transform the one 
above it as well in the process.

>  void irq_free_descs(unsigned int irq, unsigned int cnt);
>  int irq_reserve_irqs(unsigned int from, unsigned int cnt);
> +int irq_can_alloc_irqs(unsigned int from, unsigned int cnt);
>  
>  static inline void irq_free_desc(unsigned int irq)
>  {
> diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
> index 57d86d0..2230389 100644
> --- a/kernel/irq/chip.c
> +++ b/kernel/irq/chip.c
> @@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data)
>  EXPORT_SYMBOL(irq_set_handler_data);
>  
>  /**
> - *	irq_set_msi_desc - set MSI descriptor data for an irq
> - *	@irq:	Interrupt number
> - *	@entry:	Pointer to MSI descriptor data
> + *	irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
> + *	@irq_base:	Interrupt number base
> + *	@irq_offset:	Interrupt number offset
> + *	@entry:		Pointer to MSI descriptor data
>   *
> - *	Set the MSI descriptor entry for an irq
> + *	Set the MSI descriptor entry for an irq at offset
>   */
> -int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
> +int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
> +			 struct msi_desc *entry)
>  {
>  	unsigned long flags;
> -	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
> +	struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
>  
>  	if (!desc)
>  		return -EINVAL;
>  	desc->irq_data.msi_desc = entry;
> -	if (entry)
> -		entry->irq = irq;
> +	if (entry && !irq_offset)
> +		entry->irq = irq_base;
>  	irq_put_desc_unlock(desc, flags);
>  	return 0;
>  }
>  
>  /**
> + *	irq_set_msi_desc - set MSI descriptor data for an irq
> + *	@irq:	Interrupt number
> + *	@entry:	Pointer to MSI descriptor data
> + *
> + *	Set the MSI descriptor entry for an irq
> + */
> +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
> +{
> +	return irq_set_msi_desc_off(irq, 0, entry);
> +}
> +
> +/**
>   *	irq_set_chip_data - set irq chip data for an irq
>   *	@irq:	Interrupt number
>   *	@data:	Pointer to chip specific data
> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
> index 192a302..8287b78 100644
> --- a/kernel/irq/irqdesc.c
> +++ b/kernel/irq/irqdesc.c
> @@ -210,6 +210,13 @@ static int irq_expand_nr_irqs(unsigned int nr)
>  	return 0;
>  }
>  
> +static int irq_can_expand_nr_irqs(unsigned int nr)
> +{
> +	if (nr > IRQ_BITMAP_BITS)
> +		return -ENOMEM;
> +	return 0;
> +}
> +
>  int __init early_irq_init(void)
>  {
>  	int i, initcnt, node = first_online_node;
> @@ -414,6 +421,30 @@ int irq_reserve_irqs(unsigned int from, unsigned int cnt)
>  }
>  
>  /**
> + * irq_can_alloc_irqs - checks if a range of irqs could be allocated
> + * @from:	check from irq number
> + * @cnt:	number of irqs to check
> + *
> + * Returns 0 on success or an appropriate error code
> + */
> +int irq_can_alloc_irqs(unsigned int from, unsigned int cnt)
> +{
> +	unsigned int start;
> +	int ret = 0;
> +
> +	if (!cnt)
> +		return -EINVAL;
> +
> +	mutex_lock(&sparse_irq_lock);
> +	start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
> +					   from, cnt, 0);
> +	mutex_unlock(&sparse_irq_lock);
> +	if (start + cnt > nr_irqs)
> +		ret = irq_can_expand_nr_irqs(start + cnt);
> +	return ret;

How is this supposed to work wrt. races?

Thanks,

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Gordeev Oct. 2, 2012, 11:06 a.m. UTC | #2
On Tue, Oct 02, 2012 at 06:55:18AM +0200, Ingo Molnar wrote:

Thanks for the review, Ingo.

> > @@ -584,8 +586,12 @@ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
> >  #define irq_alloc_desc_from(from, node)		\
> >  	irq_alloc_descs(-1, from, 1, node)
> >  
> > +#define irq_alloc_descs_from(from, cnt, node)	\
> > +	irq_alloc_descs(-1, from, cnt, node)
> > +
> 
> Please use inlines instead of macros. Might transform the one 
> above it as well in the process.

You mean here do not introduce irq_alloc_descs_from, but rather use
irq_alloc_descs() directly?

> > +int irq_can_alloc_irqs(unsigned int from, unsigned int cnt)
> > +{
> > +	unsigned int start;
> > +	int ret = 0;
> > +
> > +	if (!cnt)
> > +		return -EINVAL;
> > +
> > +	mutex_lock(&sparse_irq_lock);
> > +	start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
> > +					   from, cnt, 0);
> > +	mutex_unlock(&sparse_irq_lock);
> > +	if (start + cnt > nr_irqs)
> > +		ret = irq_can_expand_nr_irqs(start + cnt);
> > +	return ret;
> 
> How is this supposed to work wrt. races?

It is not supposed. Just a quick check if there are enough bits before an
attempt to allocate memory in __create_irqs(). Otherwise __create_irqs()
might allocate irq_cfg's, then realize there are no bits, then deallocate
and fail.

But strictly speaking, irq_can_alloc_irqs() is unnecessary.
Ingo Molnar Oct. 2, 2012, 11:25 a.m. UTC | #3
* Alexander Gordeev <agordeev@redhat.com> wrote:

> On Tue, Oct 02, 2012 at 06:55:18AM +0200, Ingo Molnar wrote:
> 
> Thanks for the review, Ingo.
> 
> > > @@ -584,8 +586,12 @@ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
> > >  #define irq_alloc_desc_from(from, node)		\
> > >  	irq_alloc_descs(-1, from, 1, node)
> > >  
> > > +#define irq_alloc_descs_from(from, cnt, node)	\
> > > +	irq_alloc_descs(-1, from, cnt, node)
> > > +
> > 
> > Please use inlines instead of macros. Might transform the one 
> > above it as well in the process.
> 
> You mean here do not introduce irq_alloc_descs_from, but rather use
> irq_alloc_descs() directly?

My suggestion is to add irq_alloc_descs_from() as a (very 
simple) inline function and change irq_alloc_desc_from() to be 
an inline function as well.

> > > +int irq_can_alloc_irqs(unsigned int from, unsigned int cnt)
> > > +{
> > > +	unsigned int start;
> > > +	int ret = 0;
> > > +
> > > +	if (!cnt)
> > > +		return -EINVAL;
> > > +
> > > +	mutex_lock(&sparse_irq_lock);
> > > +	start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
> > > +					   from, cnt, 0);
> > > +	mutex_unlock(&sparse_irq_lock);
> > > +	if (start + cnt > nr_irqs)
> > > +		ret = irq_can_expand_nr_irqs(start + cnt);
> > > +	return ret;
> > 
> > How is this supposed to work wrt. races?
> 
> It is not supposed. Just a quick check if there are enough bits before an
> attempt to allocate memory in __create_irqs(). Otherwise __create_irqs()
> might allocate irq_cfg's, then realize there are no bits, then deallocate
> and fail.
>
> But strictly speaking, irq_can_alloc_irqs() is unnecessary.

Why complicate it if it's unnecessary? The function is inviting 
wrong logic: it *cannot* tell whether there are enough bits, 
because the check is racy.

So I'd suggest to keep this out - this will further simplify the 
patches.

Thanks,

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Gordeev Oct. 4, 2012, 7:54 a.m. UTC | #4
On Tue, Oct 02, 2012 at 01:25:24PM +0200, Ingo Molnar wrote:
> > > > @@ -584,8 +586,12 @@ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
> > > >  #define irq_alloc_desc_from(from, node)		\
> > > >  	irq_alloc_descs(-1, from, 1, node)
> > > >  
> > > > +#define irq_alloc_descs_from(from, cnt, node)	\
> > > > +	irq_alloc_descs(-1, from, cnt, node)
> > > > +
> > > 
> > > Please use inlines instead of macros. Might transform the one 
> > > above it as well in the process.
> > 
> > You mean here do not introduce irq_alloc_descs_from, but rather use
> > irq_alloc_descs() directly?
> 
> My suggestion is to add irq_alloc_descs_from() as a (very 
> simple) inline function and change irq_alloc_desc_from() to be 
> an inline function as well.

These defines were added on purpose with commit ec53cf2 ("irq: don't put
module.h into irq.h for tracking irqgen modules.") - the relevant hunk is
below. I suppose we do not want to revert it?


@@ -567,29 +567,21 @@ static inline struct msi_desc *irq_data_get_msi(struct irq_data *d)
 int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
 		struct module *owner);
 
-static inline int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt,
-		int node)
-{
-	return __irq_alloc_descs(irq, from, cnt, node, THIS_MODULE);
-}
+/* use macros to avoid needing export.h for THIS_MODULE */
+#define irq_alloc_descs(irq, from, cnt, node)	\
+	__irq_alloc_descs(irq, from, cnt, node, THIS_MODULE)
 
-void irq_free_descs(unsigned int irq, unsigned int cnt);
-int irq_reserve_irqs(unsigned int from, unsigned int cnt);
+#define irq_alloc_desc(node)			\
+	irq_alloc_descs(-1, 0, 1, node)
 
-static inline int irq_alloc_desc(int node)
-{
-	return irq_alloc_descs(-1, 0, 1, node);
-}
+#define irq_alloc_desc_at(at, node)		\
+	irq_alloc_descs(at, at, 1, node)
 
-static inline int irq_alloc_desc_at(unsigned int at, int node)
-{
-	return irq_alloc_descs(at, at, 1, node);
-}
+#define irq_alloc_desc_from(from, node)		\
+	irq_alloc_descs(-1, from, 1, node)
 
-static inline int irq_alloc_desc_from(unsigned int from, int node)
-{
-	return irq_alloc_descs(-1, from, 1, node);
-}
+void irq_free_descs(unsigned int irq, unsigned int cnt);
+int irq_reserve_irqs(unsigned int from, unsigned int cnt);
 
 static inline void irq_free_desc(unsigned int irq)
 {
Ingo Molnar Oct. 4, 2012, 9:17 a.m. UTC | #5
* Alexander Gordeev <agordeev@redhat.com> wrote:

> On Tue, Oct 02, 2012 at 01:25:24PM +0200, Ingo Molnar wrote:
> > > > > @@ -584,8 +586,12 @@ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
> > > > >  #define irq_alloc_desc_from(from, node)		\
> > > > >  	irq_alloc_descs(-1, from, 1, node)
> > > > >  
> > > > > +#define irq_alloc_descs_from(from, cnt, node)	\
> > > > > +	irq_alloc_descs(-1, from, cnt, node)
> > > > > +
> > > > 
> > > > Please use inlines instead of macros. Might transform the one 
> > > > above it as well in the process.
> > > 
> > > You mean here do not introduce irq_alloc_descs_from, but rather use
> > > irq_alloc_descs() directly?
> > 
> > My suggestion is to add irq_alloc_descs_from() as a (very 
> > simple) inline function and change irq_alloc_desc_from() to be 
> > an inline function as well.
> 
> These defines were added on purpose with commit ec53cf2 ("irq: 
> don't put module.h into irq.h for tracking irqgen modules.") - 
> the relevant hunk is below. I suppose we do not want to revert 
> it?

Sigh - that commit is really making a step backwards, but indeed 
you are probably right that reintroducing the inlines would 
create header dependency problems - which should be addressed in 
another patch.

So I concur with your original approach that added a macro.

Thanks,

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index c265593..d5cb13c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -305,6 +305,11 @@  static int alloc_irq_from(unsigned int from, int node)
 	return irq_alloc_desc_from(from, node);
 }
 
+static int alloc_irqs_from(unsigned int from, unsigned int count, int node)
+{
+	return irq_alloc_descs_from(from, count, node);
+}
+
 static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
 {
 	free_irq_cfg(at, cfg);
@@ -2991,37 +2996,58 @@  device_initcall(ioapic_init_ops);
 /*
  * Dynamic irq allocate and deallocation
  */
-unsigned int create_irq_nr(unsigned int from, int node)
+unsigned int __create_irqs(unsigned int from, unsigned int count, int node)
 {
-	struct irq_cfg *cfg;
+	struct irq_cfg **cfg;
 	unsigned long flags;
-	unsigned int ret = 0;
-	int irq;
+	int irq, i;
 
 	if (from < nr_irqs_gsi)
 		from = nr_irqs_gsi;
 
-	irq = alloc_irq_from(from, node);
-	if (irq < 0)
-		return 0;
-	cfg = alloc_irq_cfg(irq, node);
-	if (!cfg) {
-		free_irq_at(irq, NULL);
+	cfg = kzalloc_node(count * sizeof(cfg[0]), GFP_KERNEL, node);
+	if (!cfg)
 		return 0;
+
+	irq = alloc_irqs_from(from, count, node);
+	if (irq < 0)
+		goto out_cfgs;
+
+	for (i = 0; i < count; i++) {
+		cfg[i] = alloc_irq_cfg(irq + i, node);
+		if (!cfg[i])
+			goto out_irqs;
 	}
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
-		ret = irq;
+	for (i = 0; i < count; i++)
+		if (__assign_irq_vector(irq + i, cfg[i], apic->target_cpus()))
+			goto out_vecs;
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 
-	if (ret) {
-		irq_set_chip_data(irq, cfg);
-		irq_clear_status_flags(irq, IRQ_NOREQUEST);
-	} else {
-		free_irq_at(irq, cfg);
+	for (i = 0; i < count; i++) {
+		irq_set_chip_data(irq + i, cfg[i]);
+		irq_clear_status_flags(irq + i, IRQ_NOREQUEST);
 	}
-	return ret;
+
+	kfree(cfg);
+	return irq;
+
+out_vecs:
+	for (; i; i--)
+		__clear_irq_vector(irq + i - 1, cfg[i - 1]);
+	raw_spin_unlock_irqrestore(&vector_lock, flags);
+out_irqs:
+	for (i = 0; i < count; i++)
+		free_irq_at(irq + i, cfg[i]);
+out_cfgs:
+	kfree(cfg);
+	return 0;
+}
+
+unsigned int create_irq_nr(unsigned int from, int node)
+{
+	return __create_irqs(from, 1, node);
 }
 
 int create_irq(void)
@@ -3054,6 +3080,27 @@  void destroy_irq(unsigned int irq)
 	free_irq_at(irq, cfg);
 }
 
+static inline void destroy_irqs(unsigned int irq, unsigned int count)
+{
+	unsigned int i;
+	for (i = 0; i < count; i++)
+		destroy_irq(irq + i);
+}
+
+static inline int
+can_create_pow_of_two_irqs(unsigned int from, unsigned int count)
+{
+	if ((count > 1) && (count % 2))
+		return -EINVAL;
+
+	for (; count; count = count / 2) {
+		if (!irq_can_alloc_irqs(from, count))
+			return count;
+	}
+
+	return -ENOSPC;
+}
+
 /*
  * MSI message composition
  */
@@ -3145,18 +3192,25 @@  static struct irq_chip msi_chip = {
 	.irq_retrigger		= ioapic_retrigger_irq,
 };
 
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+			 unsigned int irq_base, unsigned int irq_offset)
 {
 	struct irq_chip *chip = &msi_chip;
 	struct msi_msg msg;
+	unsigned int irq = irq_base + irq_offset;
 	int ret;
 
 	ret = msi_compose_msg(dev, irq, &msg, -1);
 	if (ret < 0)
 		return ret;
 
-	irq_set_msi_desc(irq, msidesc);
-	write_msi_msg(irq, &msg);
+	irq_set_msi_desc_off(irq_base, irq_offset, msidesc);
+
+	/* MSI-X message is written per-IRQ, the offset is always 0.
+	 * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
+	 */
+	if (!irq_offset)
+		write_msi_msg(irq, &msg);
 
 	if (irq_remapped(irq_get_chip_data(irq))) {
 		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
@@ -3170,16 +3224,12 @@  static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 	return 0;
 }
 
-int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+int setup_msix_irqs(struct pci_dev *dev, int nvec)
 {
 	int node, ret, sub_handle, index = 0;
 	unsigned int irq, irq_want;
 	struct msi_desc *msidesc;
 
-	/* x86 doesn't support multiple MSI yet */
-	if (type == PCI_CAP_ID_MSI && nvec > 1)
-		return 1;
-
 	node = dev_to_node(&dev->dev);
 	irq_want = nr_irqs_gsi;
 	sub_handle = 0;
@@ -3208,7 +3258,7 @@  int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 				goto error;
 		}
 no_ir:
-		ret = setup_msi_irq(dev, msidesc, irq);
+		ret = setup_msi_irq(dev, msidesc, irq, 0);
 		if (ret < 0)
 			goto error;
 		sub_handle++;
@@ -3220,6 +3270,76 @@  error:
 	return ret;
 }
 
+int setup_msi_irqs(struct pci_dev *dev, int nvec)
+{
+	int node, ret, sub_handle, index = 0;
+	unsigned int irq;
+	struct msi_desc *msidesc;
+
+	if (nvec > 1 && !irq_remapping_enabled)
+		return 1;
+
+	nvec = __roundup_pow_of_two(nvec);
+	ret = can_create_pow_of_two_irqs(nr_irqs_gsi, nvec);
+	if (ret != nvec)
+		return ret;
+
+	WARN_ON(!list_is_singular(&dev->msi_list));
+	msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
+	WARN_ON(msidesc->irq);
+	WARN_ON(msidesc->msi_attrib.multiple);
+
+	node = dev_to_node(&dev->dev);
+	irq = __create_irqs(nr_irqs_gsi, nvec, node);
+	if (irq == 0)
+		return -ENOSPC;
+
+	if (!irq_remapping_enabled) {
+		ret = setup_msi_irq(dev, msidesc, irq, 0);
+		if (ret < 0)
+			goto error;
+		return 0;
+	}
+
+	msidesc->msi_attrib.multiple = ilog2(nvec);
+	for (sub_handle = 0; sub_handle < nvec; sub_handle++) {
+		if (!sub_handle) {
+			index = msi_alloc_remapped_irq(dev, irq, nvec);
+			if (index < 0) {
+				ret = index;
+				goto error;
+			}
+		} else {
+			ret = msi_setup_remapped_irq(dev, irq + sub_handle,
+						     index, sub_handle);
+			if (ret < 0)
+				goto error;
+		}
+		ret = setup_msi_irq(dev, msidesc, irq, sub_handle);
+		if (ret < 0)
+			goto error;
+	}
+	return 0;
+
+error:
+	destroy_irqs(irq, nvec);
+
+	/* Restore altered MSI descriptor fields and prevent just destroyed
+	 * IRQs from tearing down again in default_teardown_msi_irqs()
+	 */
+	msidesc->irq = 0;
+	msidesc->msi_attrib.multiple = 0;
+
+	return ret;
+}
+
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	if (type == PCI_CAP_ID_MSI)
+		return setup_msi_irqs(dev, nvec);
+	return setup_msix_irqs(dev, nvec);
+}
+
 void native_teardown_msi_irq(unsigned int irq)
 {
 	destroy_irq(irq);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 216b0ba..c3ba39f 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -522,6 +522,8 @@  extern int irq_set_handler_data(unsigned int irq, void *data);
 extern int irq_set_chip_data(unsigned int irq, void *data);
 extern int irq_set_irq_type(unsigned int irq, unsigned int type);
 extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry);
+extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
+				struct msi_desc *entry);
 extern struct irq_data *irq_get_irq_data(unsigned int irq);
 
 static inline struct irq_chip *irq_get_chip(unsigned int irq)
@@ -584,8 +586,12 @@  int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
 #define irq_alloc_desc_from(from, node)		\
 	irq_alloc_descs(-1, from, 1, node)
 
+#define irq_alloc_descs_from(from, cnt, node)	\
+	irq_alloc_descs(-1, from, cnt, node)
+
 void irq_free_descs(unsigned int irq, unsigned int cnt);
 int irq_reserve_irqs(unsigned int from, unsigned int cnt);
+int irq_can_alloc_irqs(unsigned int from, unsigned int cnt);
 
 static inline void irq_free_desc(unsigned int irq)
 {
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 57d86d0..2230389 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -90,27 +90,41 @@  int irq_set_handler_data(unsigned int irq, void *data)
 EXPORT_SYMBOL(irq_set_handler_data);
 
 /**
- *	irq_set_msi_desc - set MSI descriptor data for an irq
- *	@irq:	Interrupt number
- *	@entry:	Pointer to MSI descriptor data
+ *	irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
+ *	@irq_base:	Interrupt number base
+ *	@irq_offset:	Interrupt number offset
+ *	@entry:		Pointer to MSI descriptor data
  *
- *	Set the MSI descriptor entry for an irq
+ *	Set the MSI descriptor entry for an irq at offset
  */
-int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
+int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
+			 struct msi_desc *entry)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+	struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
 
 	if (!desc)
 		return -EINVAL;
 	desc->irq_data.msi_desc = entry;
-	if (entry)
-		entry->irq = irq;
+	if (entry && !irq_offset)
+		entry->irq = irq_base;
 	irq_put_desc_unlock(desc, flags);
 	return 0;
 }
 
 /**
+ *	irq_set_msi_desc - set MSI descriptor data for an irq
+ *	@irq:	Interrupt number
+ *	@entry:	Pointer to MSI descriptor data
+ *
+ *	Set the MSI descriptor entry for an irq
+ */
+int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
+{
+	return irq_set_msi_desc_off(irq, 0, entry);
+}
+
+/**
  *	irq_set_chip_data - set irq chip data for an irq
  *	@irq:	Interrupt number
  *	@data:	Pointer to chip specific data
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 192a302..8287b78 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -210,6 +210,13 @@  static int irq_expand_nr_irqs(unsigned int nr)
 	return 0;
 }
 
+static int irq_can_expand_nr_irqs(unsigned int nr)
+{
+	if (nr > IRQ_BITMAP_BITS)
+		return -ENOMEM;
+	return 0;
+}
+
 int __init early_irq_init(void)
 {
 	int i, initcnt, node = first_online_node;
@@ -414,6 +421,30 @@  int irq_reserve_irqs(unsigned int from, unsigned int cnt)
 }
 
 /**
+ * irq_can_alloc_irqs - checks if a range of irqs could be allocated
+ * @from:	check from irq number
+ * @cnt:	number of irqs to check
+ *
+ * Returns 0 on success or an appropriate error code
+ */
+int irq_can_alloc_irqs(unsigned int from, unsigned int cnt)
+{
+	unsigned int start;
+	int ret = 0;
+
+	if (!cnt)
+		return -EINVAL;
+
+	mutex_lock(&sparse_irq_lock);
+	start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
+					   from, cnt, 0);
+	mutex_unlock(&sparse_irq_lock);
+	if (start + cnt > nr_irqs)
+		ret = irq_can_expand_nr_irqs(start + cnt);
+	return ret;
+}
+
+/**
  * irq_get_next_irq - get next allocated irq number
  * @offset:	where to start the search
  *