diff mbox

[v2,2/2] Intel-IOMMU, intr-remap: source-id checking

Message ID 1242757912-6041-3-git-send-email-weidong.han@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Han, Weidong May 19, 2009, 6:31 p.m. UTC
To support domain-isolation usages, the platform hardware must be
capable of uniquely identifying the requestor (source-id) for each
interrupt message. Without source-id checking for interrupt remapping
, a rouge guest/VM with assigned devices can launch interrupt attacks
to bring down anothe guest/VM or the VMM itself.

This patch adds source-id checking for interrupt remapping, and then
really isolates interrupts for guests/VMs with assigned devices.

Because PCI subsystem is not initialized yet when set up IOAPIC
entries, use read_pci_config_byte to access PCI config space directly.

Signed-off-by: Weidong Han <weidong.han@intel.com>
---
 arch/x86/kernel/apic/io_apic.c |    6 +++
 drivers/pci/intr_remapping.c   |   90 ++++++++++++++++++++++++++++++++++++++-
 drivers/pci/intr_remapping.h   |    2 +
 include/linux/dmar.h           |   11 +++++
 4 files changed, 106 insertions(+), 3 deletions(-)

Comments

Ingo Molnar May 19, 2009, 11:50 a.m. UTC | #1
* Weidong Han <weidong.han@intel.com> wrote:

> To support domain-isolation usages, the platform hardware must be 
> capable of uniquely identifying the requestor (source-id) for each 
> interrupt message. Without source-id checking for interrupt 
> remapping , a rouge guest/VM with assigned devices can launch 
> interrupt attacks to bring down anothe guest/VM or the VMM itself.
> 
> This patch adds source-id checking for interrupt remapping, and 
> then really isolates interrupts for guests/VMs with assigned 
> devices.
> 
> Because PCI subsystem is not initialized yet when set up IOAPIC 
> entries, use read_pci_config_byte to access PCI config space 
> directly.
> 
> Signed-off-by: Weidong Han <weidong.han@intel.com>
> ---
>  arch/x86/kernel/apic/io_apic.c |    6 +++
>  drivers/pci/intr_remapping.c   |   90 ++++++++++++++++++++++++++++++++++++++-
>  drivers/pci/intr_remapping.h   |    2 +
>  include/linux/dmar.h           |   11 +++++
>  4 files changed, 106 insertions(+), 3 deletions(-)

Code structure looks nice now. (and i susect you have tested this on 
real and relevant hardware?) I've Cc:-ed Eric too ... does this 
direction look good to you too Eric?

Have a few minor nits only:

> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> index 30da617..3d10c68 100644
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -1559,6 +1559,9 @@ int setup_ioapic_entry(int apic_id, int irq,
>  		irte.vector = vector;
>  		irte.dest_id = IRTE_DEST(destination);
>  
> +		/* Set source-id of interrupt request */
> +		set_ioapic_sid(&irte, apic_id);
> +
>  		modify_irte(irq, &irte);
>  
>  		ir_entry->index2 = (index >> 15) & 0x1;
> @@ -3329,6 +3332,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
>  		irte.vector = cfg->vector;
>  		irte.dest_id = IRTE_DEST(dest);
>  
> +		/* Set source-id of interrupt request */
> +		set_msi_sid(&irte, pdev);
> +
>  		modify_irte(irq, &irte);
>  
>  		msg->address_hi = MSI_ADDR_BASE_HI;
> diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
> index 946e170..9ef7b0d 100644
> --- a/drivers/pci/intr_remapping.c
> +++ b/drivers/pci/intr_remapping.c
> @@ -10,6 +10,8 @@
>  #include <linux/intel-iommu.h>
>  #include "intr_remapping.h"
>  #include <acpi/acpi.h>
> +#include <asm/pci-direct.h>
> +#include "pci.h"
>  
>  static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
>  static int ir_ioapic_num;
> @@ -405,6 +407,61 @@ int free_irte(int irq)
>  	return rc;
>  }
>  
> +int set_ioapic_sid(struct irte *irte, int apic)
> +{
> +	int i;
> +	u16 sid = 0;
> +
> +	if (!irte)
> +		return -1;
> +
> +	for (i = 0; i < MAX_IO_APICS; i++)
> +		if (ir_ioapic[i].id == apic) {
> +			sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
> +			break;
> +		}

Please generally put extra curly braces around each multi-line loop 
body. One reason beyond readability is robustness: the above 
structure can be easily extended in a buggy way via [mockup patch 
hunk]:

> 			sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
> 			break;
> 		}
> +		if (!sid)
> +			break;

And note that if this slips in by accident how unobvious this bug is 
during patch review - the loop head context is not present in the 
3-line default context and the code "looks" correct at a glance.

With extra braces, such typos or mismerges:

> 		}
> 	}
> +		if (!sid)
> +			break;

stick out during review like a sore thumb :-)

> +	if (sid == 0) {
> +		printk(KERN_WARNING "Failed to set source-id of "
> +		       "I/O APIC (%d), because it is not under "
> +		       "any DRHD\n", apic);
> +		return -1;
> +	}

please try to keep kernel messages on a single line - even if 
checkpatch complains. Also, it's a good idea to use pr_warning(), 
it's shorter by 8 characters.

> +
> +	irte->svt = 1; /* requestor ID verification SID/SQ */
> +	irte->sq = 0;  /* comparing all 16-bit of SID */
> +	irte->sid = sid;

this is a borderline suggestion:

Note how you already lined up the _comments_ vertically, so you did 
notice that it makes sense to align vertically. The same visual 
arguments can be made for the initialization itself too:

> +
> +	irte->svt	= 1;	/* requestor ID verification SID/SQ */
> +	irte->sq	= 0;	/* comparing all 16-bit of SID */
> +	irte->sid	= sid;
> +
> +	return 0;

But ... it might make even more sense to introduce a set_irte() 
helper method, so it can all be written in a single line as:

	set_irte(irte, 1, 0, sid);

and explain common values in the set_irte() function's description - 
that way those comments above (and below) dont have to be made at 
the usage sites.

> +}
> +
> +int set_msi_sid(struct irte *irte, struct pci_dev *dev)
> +{
> +	struct pci_dev *tmp;
> +
> +	if (!irte || !dev)
> +		return -1;
> +
> +	tmp = pci_find_upstream_pcie_bridge(dev);

variable names like 'tmp' are only used if they are _really_ short 
in scope. Here a much better name would be 'bridge'.

> +	if (!tmp) { /* PCIE device or integrated PCI device */
> +		irte->svt = 1; /* verify requestor ID verification SID/SQ */
> +		irte->sq = 0;  /* comparing all 16-bit of SID */
> +		irte->sid = (dev->bus->number << 8) | dev->devfn;
> +		return 0;
> +	}
> +
> +	if (tmp->is_pcie) { /* this is a PCIE-to-PCI/PCIX bridge */
> +		irte->svt = 2; /* verify request ID verification SID */
> +		irte->sid = (tmp->bus->number << 8) | dev->bus->number;

is irte->sq left alone intentionally?

> +	} else { /* this is a legacy PCI bridge */
> +		irte->svt = 1; /* verify requestor ID verification SID/SQ */
> +		irte->sq = 0;  /* comparing all 16-bit of SID */
> +		irte->sid = (tmp->bus->number << 8) | tmp->devfn;
> +	}
> +	if (tmp->is_pcie) { /* this is a PCIE-to-PCI/PCIX bridge */
> +		irte->svt = 2; /* verify request ID verification SID */
> +		irte->sid = (tmp->bus->number << 8) | dev->bus->number;

here too?

> +	} else { /* this is a legacy PCI bridge */
> +		irte->svt = 1; /* verify requestor ID verification SID/SQ */
> +		irte->sq = 0;  /* comparing all 16-bit of SID */
> +		irte->sid = (tmp->bus->number << 8) | tmp->devfn;
> +	}
> +
> +	return 0;
> +}
> +
>  static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
>  {
>  	u64 addr;
> @@ -610,6 +667,35 @@ error:
>  	return -1;
>  }
>  
> +static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
> +				      struct intel_iommu *iommu)
> +{
> +	struct acpi_dmar_pci_path *path;
> +	u8 bus;
> +	int count;
> +
> +	bus = scope->bus;
> +	path = (struct acpi_dmar_pci_path *)(scope + 1);
> +	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
> +		/ sizeof(struct acpi_dmar_pci_path);
> +
> +	while (--count > 0) {
> +		/*
> +		 * Access PCI directly due to the PCI
> +		 * subsystem isn't initialized yet.
> +		 */
> +		bus = read_pci_config_byte(bus, path->dev, path->fn,
> +					   PCI_SECONDARY_BUS);
> +		path++;
> +	}
> +
> +	ir_ioapic[ir_ioapic_num].bus = bus;
> +	ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->dev, path->fn);
> +	ir_ioapic[ir_ioapic_num].iommu = iommu;
> +	ir_ioapic[ir_ioapic_num].id = scope->enumeration_id;

this too can be aligned vertically.

> +	ir_ioapic_num++;
> +}
> +
>  static int ir_parse_ioapic_scope(struct acpi_dmar_header *header,
>  				 struct intel_iommu *iommu)
>  {
> @@ -634,9 +720,7 @@ static int ir_parse_ioapic_scope(struct acpi_dmar_header *header,
>  			       " 0x%Lx\n", scope->enumeration_id,
>  			       drhd->address);
>  
> -			ir_ioapic[ir_ioapic_num].iommu = iommu;
> -			ir_ioapic[ir_ioapic_num].id = scope->enumeration_id;
> -			ir_ioapic_num++;
> +			ir_parse_one_ioapic_scope(scope, iommu);

how much nicer this helper looks like now!

>  		}
>  		start += scope->length;
>  	}
> diff --git a/drivers/pci/intr_remapping.h b/drivers/pci/intr_remapping.h
> index ca48f0d..dd35780 100644
> --- a/drivers/pci/intr_remapping.h
> +++ b/drivers/pci/intr_remapping.h
> @@ -3,6 +3,8 @@
>  struct ioapic_scope {
>  	struct intel_iommu *iommu;
>  	unsigned int id;
> +	u8 bus;			/* PCI bus number */
> +	u8 devfn;		/* PCI devfn number */

hm, in kernel data structures we usually encode devfn as unsigned 
int - and sometimes even bus. Only 8 bits will be used of both so 
it's the same end result, but it results in more efficient 
instructions without byte shuffling.

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Han, Weidong May 19, 2009, 2:12 p.m. UTC | #2
Ingo Molnar wrote:
> * Weidong Han <weidong.han@intel.com> wrote:
> 
>> To support domain-isolation usages, the platform hardware must be
>> capable of uniquely identifying the requestor (source-id) for each
>> interrupt message. Without source-id checking for interrupt
>> remapping , a rouge guest/VM with assigned devices can launch
>> interrupt attacks to bring down anothe guest/VM or the VMM itself.
>> 
>> This patch adds source-id checking for interrupt remapping, and
>> then really isolates interrupts for guests/VMs with assigned
>> devices.
>> 
>> Because PCI subsystem is not initialized yet when set up IOAPIC
>> entries, use read_pci_config_byte to access PCI config space
>> directly.
>> 
>> Signed-off-by: Weidong Han <weidong.han@intel.com>
>> ---
>>  arch/x86/kernel/apic/io_apic.c |    6 +++
>>  drivers/pci/intr_remapping.c   |   90
>>  ++++++++++++++++++++++++++++++++++++++-
>>  drivers/pci/intr_remapping.h   |    2 + include/linux/dmar.h       
>>  |   11 +++++ 4 files changed, 106 insertions(+), 3 deletions(-)
> 
> Code structure looks nice now. (and i susect you have tested this on
> real and relevant hardware?) I've Cc:-ed Eric too ... does this
> direction look good to you too Eric?
> 
> Have a few minor nits only:
> 
>> diff --git a/arch/x86/kernel/apic/io_apic.c
>> b/arch/x86/kernel/apic/io_apic.c 
>> index 30da617..3d10c68 100644
>> --- a/arch/x86/kernel/apic/io_apic.c
>> +++ b/arch/x86/kernel/apic/io_apic.c
>> @@ -1559,6 +1559,9 @@ int setup_ioapic_entry(int apic_id, int irq, 
>>  		irte.vector = vector; irte.dest_id = IRTE_DEST(destination);
>> 
>> +		/* Set source-id of interrupt request */
>> +		set_ioapic_sid(&irte, apic_id);
>> +
>>  		modify_irte(irq, &irte);
>> 
>>  		ir_entry->index2 = (index >> 15) & 0x1;
>> @@ -3329,6 +3332,9 @@ static int msi_compose_msg(struct pci_dev
>>  		*pdev, unsigned int irq, struct msi_ms  		irte.vector =
>> cfg->vector; irte.dest_id = IRTE_DEST(dest); 
>> 
>> +		/* Set source-id of interrupt request */
>> +		set_msi_sid(&irte, pdev);
>> +
>>  		modify_irte(irq, &irte);
>> 
>>  		msg->address_hi = MSI_ADDR_BASE_HI;
>> diff --git a/drivers/pci/intr_remapping.c
>> b/drivers/pci/intr_remapping.c 
>> index 946e170..9ef7b0d 100644
>> --- a/drivers/pci/intr_remapping.c
>> +++ b/drivers/pci/intr_remapping.c
>> @@ -10,6 +10,8 @@
>>  #include <linux/intel-iommu.h>
>>  #include "intr_remapping.h"
>>  #include <acpi/acpi.h>
>> +#include <asm/pci-direct.h>
>> +#include "pci.h"
>> 
>>  static struct ioapic_scope ir_ioapic[MAX_IO_APICS];  static int
>> ir_ioapic_num; @@ -405,6 +407,61 @@ int free_irte(int irq)
>>  	return rc;
>>  }
>> 
>> +int set_ioapic_sid(struct irte *irte, int apic)
>> +{
>> +	int i;
>> +	u16 sid = 0;
>> +
>> +	if (!irte)
>> +		return -1;
>> +
>> +	for (i = 0; i < MAX_IO_APICS; i++)
>> +		if (ir_ioapic[i].id == apic) {
>> +			sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn; +			break;
>> +		}
> 
> Please generally put extra curly braces around each multi-line loop
> body. One reason beyond readability is robustness: the above
> structure can be easily extended in a buggy way via [mockup patch
> hunk]:
> 
>> 			sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn; 			break;
>> 		}
>> +		if (!sid)
>> +			break;
> 
> And note that if this slips in by accident how unobvious this bug is
> during patch review - the loop head context is not present in the
> 3-line default context and the code "looks" correct at a glance.
> 
> With extra braces, such typos or mismerges:
> 
>> 		}
>> 	}
>> +		if (!sid)
>> +			break;
> 
> stick out during review like a sore thumb :-)
> 
>> +	if (sid == 0) {
>> +		printk(KERN_WARNING "Failed to set source-id of "
>> +		       "I/O APIC (%d), because it is not under "
>> +		       "any DRHD\n", apic);
>> +		return -1;
>> +	}
> 
> please try to keep kernel messages on a single line - even if
> checkpatch complains. Also, it's a good idea to use pr_warning(),
> it's shorter by 8 characters.
> 
>> +
>> +	irte->svt = 1; /* requestor ID verification SID/SQ */
>> +	irte->sq = 0;  /* comparing all 16-bit of SID */
>> +	irte->sid = sid;
> 
> this is a borderline suggestion:
> 
> Note how you already lined up the _comments_ vertically, so you did
> notice that it makes sense to align vertically. The same visual
> arguments can be made for the initialization itself too:
> 
>> +
>> +	irte->svt	= 1;	/* requestor ID verification SID/SQ */
>> +	irte->sq	= 0;	/* comparing all 16-bit of SID */
>> +	irte->sid	= sid;
>> +
>> +	return 0;
> 
> But ... it might make even more sense to introduce a set_irte()
> helper method, so it can all be written in a single line as:
> 
> 	set_irte(irte, 1, 0, sid);
> 
> and explain common values in the set_irte() function's description -
> that way those comments above (and below) dont have to be made at
> the usage sites.
> 
>> +}
>> +
>> +int set_msi_sid(struct irte *irte, struct pci_dev *dev) +{
>> +	struct pci_dev *tmp;
>> +
>> +	if (!irte || !dev)
>> +		return -1;
>> +
>> +	tmp = pci_find_upstream_pcie_bridge(dev);
> 
> variable names like 'tmp' are only used if they are _really_ short
> in scope. Here a much better name would be 'bridge'.
> 
>> +	if (!tmp) { /* PCIE device or integrated PCI device */
>> +		irte->svt = 1; /* verify requestor ID verification SID/SQ */
>> +		irte->sq = 0;  /* comparing all 16-bit of SID */
>> +		irte->sid = (dev->bus->number << 8) | dev->devfn; +		return 0;
>> +	}
>> +
>> +	if (tmp->is_pcie) { /* this is a PCIE-to-PCI/PCIX bridge */
>> +		irte->svt = 2; /* verify request ID verification SID */
>> +		irte->sid = (tmp->bus->number << 8) | dev->bus->number;
> 
> is irte->sq left alone intentionally?

Yes, sq will be ignored when svt=2. 

And thanks for your other comments. Will include them in next version.

Regards,
Weidong


> 
>> +	} else { /* this is a legacy PCI bridge */
>> +		irte->svt = 1; /* verify requestor ID verification SID/SQ */
>> +		irte->sq = 0;  /* comparing all 16-bit of SID */
>> +		irte->sid = (tmp->bus->number << 8) | tmp->devfn; +	}
>> +	if (tmp->is_pcie) { /* this is a PCIE-to-PCI/PCIX bridge */
>> +		irte->svt = 2; /* verify request ID verification SID */
>> +		irte->sid = (tmp->bus->number << 8) | dev->bus->number;
> 
> here too?
> 
>> +	} else { /* this is a legacy PCI bridge */
>> +		irte->svt = 1; /* verify requestor ID verification SID/SQ */
>> +		irte->sq = 0;  /* comparing all 16-bit of SID */
>> +		irte->sid = (tmp->bus->number << 8) | tmp->devfn; +	}
>> +
>> +	return 0;
>> +}
>> +
>>  static void iommu_set_intr_remapping(struct intel_iommu *iommu, int
>>  	mode)  { u64 addr;
>> @@ -610,6 +667,35 @@ error:
>>  	return -1;
>>  }
>> 
>> +static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope
>> *scope, +				      struct intel_iommu *iommu)
>> +{
>> +	struct acpi_dmar_pci_path *path;
>> +	u8 bus;
>> +	int count;
>> +
>> +	bus = scope->bus;
>> +	path = (struct acpi_dmar_pci_path *)(scope + 1);
>> +	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
>> +		/ sizeof(struct acpi_dmar_pci_path);
>> +
>> +	while (--count > 0) {
>> +		/*
>> +		 * Access PCI directly due to the PCI
>> +		 * subsystem isn't initialized yet.
>> +		 */
>> +		bus = read_pci_config_byte(bus, path->dev, path->fn, +					  
>> PCI_SECONDARY_BUS); +		path++;
>> +	}
>> +
>> +	ir_ioapic[ir_ioapic_num].bus = bus;
>> +	ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->dev, path->fn);
>> +	ir_ioapic[ir_ioapic_num].iommu = iommu;
>> +	ir_ioapic[ir_ioapic_num].id = scope->enumeration_id;
> 
> this too can be aligned vertically.
> 
>> +	ir_ioapic_num++;
>> +}
>> +
>>  static int ir_parse_ioapic_scope(struct acpi_dmar_header *header,
>>  				 struct intel_iommu *iommu)
>>  {
>> @@ -634,9 +720,7 @@ static int ir_parse_ioapic_scope(struct
>>  			       acpi_dmar_header *header, " 0x%Lx\n",
>>  			       scope->enumeration_id, drhd->address);
>> 
>> -			ir_ioapic[ir_ioapic_num].iommu = iommu;
>> -			ir_ioapic[ir_ioapic_num].id = scope->enumeration_id;
>> -			ir_ioapic_num++;
>> +			ir_parse_one_ioapic_scope(scope, iommu);
> 
> how much nicer this helper looks like now!
> 
>>  		}
>>  		start += scope->length;
>>  	}
>> diff --git a/drivers/pci/intr_remapping.h
>> b/drivers/pci/intr_remapping.h 
>> index ca48f0d..dd35780 100644
>> --- a/drivers/pci/intr_remapping.h
>> +++ b/drivers/pci/intr_remapping.h
>> @@ -3,6 +3,8 @@
>>  struct ioapic_scope {
>>  	struct intel_iommu *iommu;
>>  	unsigned int id;
>> +	u8 bus;			/* PCI bus number */
>> +	u8 devfn;		/* PCI devfn number */
> 
> hm, in kernel data structures we usually encode devfn as unsigned
> int - and sometimes even bus. Only 8 bits will be used of both so
> it's the same end result, but it results in more efficient
> instructions without byte shuffling.
> 
> 	Ingo

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman May 19, 2009, 7:28 p.m. UTC | #3
Ingo Molnar <mingo@elte.hu> writes:

> * Weidong Han <weidong.han@intel.com> wrote:
>
>> To support domain-isolation usages, the platform hardware must be 
>> capable of uniquely identifying the requestor (source-id) for each 
>> interrupt message. Without source-id checking for interrupt 
>> remapping , a rouge guest/VM with assigned devices can launch 
>> interrupt attacks to bring down anothe guest/VM or the VMM itself.
>> 
>> This patch adds source-id checking for interrupt remapping, and 
>> then really isolates interrupts for guests/VMs with assigned 
>> devices.
>> 
>> Because PCI subsystem is not initialized yet when set up IOAPIC 
>> entries, use read_pci_config_byte to access PCI config space 
>> directly.
>> 
>> Signed-off-by: Weidong Han <weidong.han@intel.com>
>> ---
>>  arch/x86/kernel/apic/io_apic.c |    6 +++
>>  drivers/pci/intr_remapping.c   |   90 ++++++++++++++++++++++++++++++++++++++-
>>  drivers/pci/intr_remapping.h   |    2 +
>>  include/linux/dmar.h           |   11 +++++
>>  4 files changed, 106 insertions(+), 3 deletions(-)
>
> Code structure looks nice now. (and i susect you have tested this on 
> real and relevant hardware?) I've Cc:-ed Eric too ... does this 
> direction look good to you too Eric?

Being a major nitpick, I have to point out that the code is not
structured to support other iommus, and I think AMD has one that can
do this as well. 

The early pci reading of the bus is just wrong.  What happens if the
pci layer decided to renumber things?  It looks like we have a real
dependency on pci there and are avoiding sorting it out with this.

Hmm.  But that is what we use in setup_ioapic_sid....
I expect the right solution is to delay enabling ioapic entries
until driver enable them.   That could also reduce screaming
irqs during bootup in the kdump case.

set_msi_sid looks wrong.  The comment are unhelpful. irte->svt should
get an enum value or a deine (removing the repeated explanations of
the magic value) and then we could have room to explain why we
are doing what we are doing.

Not finding an upstream pcie_bridge and then concluding we are a pcie
device seems bogus.

Why if we do have an upstream pcie bridge do we only want to do a bus
range verification instead of checking just for the bus :devfn?

The legacy PCI case seems even stranger.

....

The table of apic information by apic_id also seems wrong.   Don't
we have chip_data or something that should point it  that we can
get from the irq?

Eric
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ingo Molnar May 20, 2009, 8:21 a.m. UTC | #4
* Eric W. Biederman <ebiederm@xmission.com> wrote:

> Ingo Molnar <mingo@elte.hu> writes:
> 
> > * Weidong Han <weidong.han@intel.com> wrote:
> >
> >> To support domain-isolation usages, the platform hardware must be 
> >> capable of uniquely identifying the requestor (source-id) for each 
> >> interrupt message. Without source-id checking for interrupt 
> >> remapping , a rouge guest/VM with assigned devices can launch 
> >> interrupt attacks to bring down anothe guest/VM or the VMM itself.
> >> 
> >> This patch adds source-id checking for interrupt remapping, and 
> >> then really isolates interrupts for guests/VMs with assigned 
> >> devices.
> >> 
> >> Because PCI subsystem is not initialized yet when set up IOAPIC 
> >> entries, use read_pci_config_byte to access PCI config space 
> >> directly.
> >> 
> >> Signed-off-by: Weidong Han <weidong.han@intel.com>
> >> ---
> >>  arch/x86/kernel/apic/io_apic.c |    6 +++
> >>  drivers/pci/intr_remapping.c   |   90 ++++++++++++++++++++++++++++++++++++++-
> >>  drivers/pci/intr_remapping.h   |    2 +
> >>  include/linux/dmar.h           |   11 +++++
> >>  4 files changed, 106 insertions(+), 3 deletions(-)
> >
> > Code structure looks nice now. (and i susect you have tested this on 
> > real and relevant hardware?) I've Cc:-ed Eric too ... does this 
> > direction look good to you too Eric?
> 
> Being a major nitpick, I have to point out that the code is not 
> structured to support other iommus, and I think AMD has one that 
> can do this as well.

(Joerg Cc:-ed)

> The early pci reading of the bus is just wrong.  What happens if 
> the pci layer decided to renumber things?  It looks like we have a 
> real dependency on pci there and are avoiding sorting it out with 
> this.

Yes ... but is there much we can do about this bootstrap dependency? 
We want to enable the IO-APIC very early in its final form. There's 
quite a bit of IRQ functionality that doesnt go via the PCI layer, 
and which is being relied on by early bootup. The timer irq must 
work, etc.

> Hmm.  But that is what we use in setup_ioapic_sid.... I expect the 
> right solution is to delay enabling ioapic entries until driver 
> enable them.  That could also reduce screaming irqs during bootup 
> in the kdump case.

Yes, and note that we are moving in that direction in tip:irq/numa 
(Yinghai Cc:-ed) - we are delaying IRQ setup to the very last 
moment. We recently got rid of 32-bit IRQ compression in that branch 
as well and the IRQ vectoring code is now unified between 64-bit and 
32-bit so it's nify and you might want to check it and look for 
holes ...

( The motivation there was different: delay allocation of the 
  irq_desc so that we can make it NUMA-local - but it has the same 
  general effect. )

> set_msi_sid looks wrong.  The comment are unhelpful. irte->svt 
> should get an enum value or a deine (removing the repeated 
> explanations of the magic value) and then we could have room to 
> explain why we are doing what we are doing.

(yes, it probably wants a helper method - i pointed these smaller 
details out in my review.)

> Not finding an upstream pcie_bridge and then concluding we are a 
> pcie device seems bogus.
> 
> Why if we do have an upstream pcie bridge do we only want to do a 
> bus range verification instead of checking just for the bus 
> :devfn?
> 
> The legacy PCI case seems even stranger.

Good points. Would be nice to get an ack from the PCI folks to make 
sure these bits are sane.

> ....
> 
> The table of apic information by apic_id also seems wrong.  Don't 
> we have chip_data or something that should point it that we can 
> get from the irq?

->chip_data is already used for io-apic configuration bits - if it's 
reused then the right way to do it is to extend struct irq_cfg in 
arch/x86/kernel/apic/io_apic.c.

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Han, Weidong May 20, 2009, 8:38 a.m. UTC | #5
Ingo Molnar wrote:
> * Eric W. Biederman <ebiederm@xmission.com> wrote:
> 
>> Ingo Molnar <mingo@elte.hu> writes:
>> 
>>> * Weidong Han <weidong.han@intel.com> wrote:
>>> 
>>>> To support domain-isolation usages, the platform hardware must be
>>>> capable of uniquely identifying the requestor (source-id) for each
>>>> interrupt message. Without source-id checking for interrupt
>>>> remapping , a rouge guest/VM with assigned devices can launch
>>>> interrupt attacks to bring down anothe guest/VM or the VMM itself.
>>>> 
>>>> This patch adds source-id checking for interrupt remapping, and
>>>> then really isolates interrupts for guests/VMs with assigned
>>>> devices.
>>>> 
>>>> Because PCI subsystem is not initialized yet when set up IOAPIC
>>>> entries, use read_pci_config_byte to access PCI config space
>>>> directly.
>>>> 
>>>> Signed-off-by: Weidong Han <weidong.han@intel.com>
>>>> ---
>>>>  arch/x86/kernel/apic/io_apic.c |    6 +++
>>>>  drivers/pci/intr_remapping.c   |   90
>>>>  ++++++++++++++++++++++++++++++++++++++-
>>>>  drivers/pci/intr_remapping.h   |    2 + include/linux/dmar.h     
>>>>  |   11 +++++ 4 files changed, 106 insertions(+), 3 deletions(-)
>>> 
>>> Code structure looks nice now. (and i susect you have tested this on
>>> real and relevant hardware?) I've Cc:-ed Eric too ... does this
>>> direction look good to you too Eric?
>> 
>> Being a major nitpick, I have to point out that the code is not
>> structured to support other iommus, and I think AMD has one that
>> can do this as well.
> 
> (Joerg Cc:-ed)

The patch just changes code in interrupt remapping of Intel IOMMU. How does AMD IOMMU remap interrupts in ioapic code? I didn't find relative code. If AMD IOMMU already enabled the same code, it can wrap the same code in IOMMU API. 

> 
>> The early pci reading of the bus is just wrong.  What happens if
>> the pci layer decided to renumber things?  It looks like we have a
>> real dependency on pci there and are avoiding sorting it out with
>> this.
> 
> Yes ... but is there much we can do about this bootstrap dependency?
> We want to enable the IO-APIC very early in its final form. There's
> quite a bit of IRQ functionality that doesnt go via the PCI layer,
> and which is being relied on by early bootup. The timer irq must
> work, etc.

Currently VT-d code didn't support pci rebalance. There is much work to do. It needs to track devcie identity changes and resultant imapct to Device Scope under each VT-d engine.

> 
>> Hmm.  But that is what we use in setup_ioapic_sid.... I expect the
>> right solution is to delay enabling ioapic entries until driver
>> enable them.  That could also reduce screaming irqs during bootup
>> in the kdump case.
> 
> Yes, and note that we are moving in that direction in tip:irq/numa
> (Yinghai Cc:-ed) - we are delaying IRQ setup to the very last
> moment. We recently got rid of 32-bit IRQ compression in that branch
> as well and the IRQ vectoring code is now unified between 64-bit and
> 32-bit so it's nify and you might want to check it and look for
> holes ...
> 
> ( The motivation there was different: delay allocation of the
>   irq_desc so that we can make it NUMA-local - but it has the same
>   general effect. )
> 
>> set_msi_sid looks wrong.  The comment are unhelpful. irte->svt
>> should get an enum value or a deine (removing the repeated
>> explanations of the magic value) and then we could have room to
>> explain why we are doing what we are doing.
> 
> (yes, it probably wants a helper method - i pointed these smaller
> details out in my review.)

will update as Ingo suggested.

Regards,
Weidong

> 
>> Not finding an upstream pcie_bridge and then concluding we are a
>> pcie device seems bogus. 
>> 
>> Why if we do have an upstream pcie bridge do we only want to do a
>> bus range verification instead of checking just for the bus
>>> devfn?
>> 
>> The legacy PCI case seems even stranger.
> 
> Good points. Would be nice to get an ack from the PCI folks to make
> sure these bits are sane.
> 
>> ....
>> 
>> The table of apic information by apic_id also seems wrong.  Don't
>> we have chip_data or something that should point it that we can
>> get from the irq?
> 
> ->chip_data is already used for io-apic configuration bits - if it's
> reused then the right way to do it is to extend struct irq_cfg in
> arch/x86/kernel/apic/io_apic.c.
> 
> 	Ingo

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Joerg Roedel May 20, 2009, 9:43 a.m. UTC | #6
On Wed, May 20, 2009 at 10:21:14AM +0200, Ingo Molnar wrote:
> 
> * Eric W. Biederman <ebiederm@xmission.com> wrote:
> > Being a major nitpick, I have to point out that the code is not 
> > structured to support other iommus, and I think AMD has one that 
> > can do this as well.
> 
> (Joerg Cc:-ed)

The AMD IOMMU does also have an interrupt remapping feature. But it is
currently unsupported in the Linux driver. When this is going to be
implemented I think we can go the same way as for device passthrough in
KVM. This was an Intel-only feature too and was later adapted to support
AMD IOMMU too.

Joerg
Eric W. Biederman May 20, 2009, 12:02 p.m. UTC | #7
Joerg Roedel <joerg.roedel@amd.com> writes:

> On Wed, May 20, 2009 at 10:21:14AM +0200, Ingo Molnar wrote:
>> 
>> * Eric W. Biederman <ebiederm@xmission.com> wrote:
>> > Being a major nitpick, I have to point out that the code is not 
>> > structured to support other iommus, and I think AMD has one that 
>> > can do this as well.
>> 
>> (Joerg Cc:-ed)
>
> The AMD IOMMU does also have an interrupt remapping feature. But it is
> currently unsupported in the Linux driver. When this is going to be
> implemented I think we can go the same way as for device passthrough in
> KVM. This was an Intel-only feature too and was later adapted to support
> AMD IOMMU too.

The Intel IOMMU is also interesting because it allows traffic from
buggy ioapcis to be remapped safely in process context, adding a large
margin of safety and flexibility to the existing irq migration
and cpu hotunplug scenarios.

Can we use the AMD IOMMU to achieve the same objective?

I would really like to say from point X on AMD and Intel systems don't
have to worry about irq migrations used caused by old buggy ioapic
state machines, which I have previously confirmed in both AMD and
Intel ioapics.

Eric
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman May 20, 2009, 12:13 p.m. UTC | #8
"Han, Weidong" <weidong.han@intel.com> writes:

>>> The early pci reading of the bus is just wrong.  What happens if
>>> the pci layer decided to renumber things?  It looks like we have a
>>> real dependency on pci there and are avoiding sorting it out with
>>> this.
>> 
>> Yes ... but is there much we can do about this bootstrap dependency?
>> We want to enable the IO-APIC very early in its final form. There's
>> quite a bit of IRQ functionality that doesnt go via the PCI layer,
>> and which is being relied on by early bootup. The timer irq must
>> work, etc.
>
> Currently VT-d code didn't support pci rebalance. There is much work to do. It needs to track devcie identity changes and resultant imapct to Device Scope under each VT-d engine.

Sure.  I am thinking of the much simpler case where the BIOS goofs up
and we need to apply some fixes at boot time.  If you can't
delay things enough to get your hands on the pci dev of your
iommu.  If you can't get that far supporting pci rebalance under VT-d is
impossible.

Eric
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman May 20, 2009, 12:24 p.m. UTC | #9
Ingo Molnar <mingo@elte.hu> writes:

>> The table of apic information by apic_id also seems wrong.  Don't 
>> we have chip_data or something that should point it that we can 
>> get from the irq?
>
> ->chip_data is already used for io-apic configuration bits - if it's 
> reused then the right way to do it is to extend struct irq_cfg in 
> arch/x86/kernel/apic/io_apic.c.

We are talking about some extra io-apic configuration bits, that
happen to currently be private to dmar.c.

chip_data, handler_data I can never keep those straight except
when I am deep in the code.  What I do know is that encouraging
more arrays of size MAX_IO_APICS seems like the wrong direction.

Which probably means adding the information we need to struct irq_cfg.

Eric
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Han, Weidong May 21, 2009, 9 a.m. UTC | #10
Ingo Molnar wrote:
> * Eric W. Biederman <ebiederm@xmission.com> wrote:
> 
>> Not finding an upstream pcie_bridge and then concluding we are a
>> pcie device seems bogus. 
>> 

If device is pcie, pci_find_upstream_pcie_bridge() will return NULL. For root complex integrated device, it won't find upstream bridge, and also return NULL. What's more, pcie device and root complex integrated device will be handled as the same way to set sid, so I mix them here. But it also returns NULL for busted hardware. I think no parent bus can be considered Root Complex integrated device, right? If so, I think can handle it as follows:

	...
	if (dev->is_pcie || !dev->bus->parent) {
		set_irte_sid(...);
		return 0;
	}

	bridge = pci_find_upstream_pcie_bridge(dev);
	if (bridge) {
		if (bridge->is_pcie) /* PCIE-to-PCI/PCIx bridge */
			set_irte_sid(...);
		else /* legacy PCI bridge */
			set_irte_sid(..);
	}

	return 0;

>> Why if we do have an upstream pcie bridge do we only want to do a
>> bus range verification instead of checking just for the bus
>>> devfn?
>> 
>> The legacy PCI case seems even stranger.

Why? If a PCI device isn't connected to a PCIe bridge, it should be a legacy bridge.

Regards,
Weidong

> 
> Good points. Would be nice to get an ack from the PCI folks to make
> sure these bits are sane.
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman May 21, 2009, 10:04 a.m. UTC | #11
"Han, Weidong" <weidong.han@intel.com> writes:

> Ingo Molnar wrote:
>> * Eric W. Biederman <ebiederm@xmission.com> wrote:
>> 
>>> Not finding an upstream pcie_bridge and then concluding we are a
>>> pcie device seems bogus. 
>>> 
>
> If device is pcie, pci_find_upstream_pcie_bridge() will return
> NULL. For root complex integrated device, it won't find upstream
> bridge, and also return NULL. What's more, pcie device and root
> complex integrated device will be handled as the same way to set
> sid, so I mix them here. But it also returns NULL for busted
> hardware. I think no parent bus can be considered Root Complex
> integrated device, right? If so, I think can handle it as follows:

I have problems with pci_find_upstream_pcie_bridge.  The
name is actively misleading about what that function does.
Returning a pci_to_pci bridge is strongly counter intuitive
give that name.

Can we change it to pci_find_upstream_pcie_to_pci_bridge.
Returning NULL in all cases when there is not an upstream
pcie_to_pci bridge.

For the set_sid case that is ideal.  For the other cases in
intel-iommu.c it may be a problem.  Is it even possible to have a
genuine pci device not behind a pcie to pci bridge on an intel
chipset with this iommu?


> 	...
> 	if (dev->is_pcie || !dev->bus->parent) {
> 		set_irte_sid(...);
> 		return 0;
> 	}
>
> 	bridge = pci_find_upstream_pcie_bridge(dev);
> 	if (bridge) {
> 		if (bridge->is_pcie) /* PCIE-to-PCI/PCIx bridge */
> 			set_irte_sid(...);
> 		else /* legacy PCI bridge */
> 			set_irte_sid(..);
> 	}
>
> 	return 0;
>
>>> Why if we do have an upstream pcie bridge do we only want to do a
>>> bus range verification instead of checking just for the bus
>>>> devfn?
>>> 
>>> The legacy PCI case seems even stranger.
>
> Why? If a PCI device isn't connected to a PCIe bridge, it should be a legacy bridge.

I am not deep in the IOV specification at the moment.  I am mostly
wondering why we pick the parts we pick to verify.  I recall
bus and devfn being on the pcie packets so that makes sense.

Why would we ever want to do something different?  Does a pcie to pci
bridge do something different in it's translation?

Eric
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Han, Weidong May 21, 2009, 1:37 p.m. UTC | #12
Eric W. Biederman wrote:
> "Han, Weidong" <weidong.han@intel.com> writes:
> 
>> Ingo Molnar wrote:
>>> * Eric W. Biederman <ebiederm@xmission.com> wrote:
>>> 
>>>> Not finding an upstream pcie_bridge and then concluding we are a
>>>> pcie device seems bogus. 
>>>> 
>> 
>> If device is pcie, pci_find_upstream_pcie_bridge() will return
>> NULL. For root complex integrated device, it won't find upstream
>> bridge, and also return NULL. What's more, pcie device and root
>> complex integrated device will be handled as the same way to set
>> sid, so I mix them here. But it also returns NULL for busted
>> hardware. I think no parent bus can be considered Root Complex
>> integrated device, right? If so, I think can handle it as follows:
> 
> I have problems with pci_find_upstream_pcie_bridge.  The
> name is actively misleading about what that function does.
> Returning a pci_to_pci bridge is strongly counter intuitive
> give that name.
> 
> Can we change it to pci_find_upstream_pcie_to_pci_bridge.
> Returning NULL in all cases when there is not an upstream
> pcie_to_pci bridge.

pci_find_upstream_pcie_bridge returns upstream pcie-to-pci/pcix bridge or legacy pci bridge for a pci device. pci_find_upstream_bridge may be more suitable.

> 
> For the set_sid case that is ideal.  For the other cases in
> intel-iommu.c it may be a problem.  Is it even possible to have a
> genuine pci device not behind a pcie to pci bridge on an intel
> chipset with this iommu?
> 

I think it's little possible. But coincide to VT-d spec, we need to handle differently for devices behind pcie-to-pci/pcix bridge and behind legacy pci bridge.

> 
>> 	...
>> 	if (dev->is_pcie || !dev->bus->parent) {
>> 		set_irte_sid(...);
>> 		return 0;
>> 	}
>> 
>> 	bridge = pci_find_upstream_pcie_bridge(dev);
>> 	if (bridge) {
>> 		if (bridge->is_pcie) /* PCIE-to-PCI/PCIx bridge */
>> 		set_irte_sid(...); else /* legacy PCI bridge */
>> 			set_irte_sid(..);
>> 	}
>> 
>> 	return 0;
>> 
>>>> Why if we do have an upstream pcie bridge do we only want to do a
>>>> bus range verification instead of checking just for the bus
>>>>> devfn?
>>>> 
>>>> The legacy PCI case seems even stranger.
>> 
>> Why? If a PCI device isn't connected to a PCIe bridge, it should be
>> a legacy bridge. 
> 
> I am not deep in the IOV specification at the moment.  I am mostly
> wondering why we pick the parts we pick to verify.  I recall
> bus and devfn being on the pcie packets so that makes sense.
> 
> Why would we ever want to do something different?  Does a pcie to pci
> bridge do something different in it's translation?

source-id is different between devices pcie-to-pci/pcix bridge and behind legacy pci bridge. Thus VT-d needs to handle it differently.

The pcie-to-pci/pcix bridges may generate a different requester-id and tag combination in some instances for transactions forwarded to the bridge's PCI Express interface. The action of replacing the original transaction's requester-id with one assigned by the bridge is generally referred to as taking 'ownership' of the transaction. If the bridge generates a new requester-id for a transaction forwarded from the secondary interface to the primary interface, the bridge assigns the PCI Express requester-id using the secondary interface's bus number, and sets both the device number and function number fields to zero. Refer to the PCI Express-to-PCI/PCI-X bridge specifications for more details.

For devices behind conventional PCI bridges, the source-id in the DMA requests is the requester-id of the bridge device.

Regards,
Weidong


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 30da617..3d10c68 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1559,6 +1559,9 @@  int setup_ioapic_entry(int apic_id, int irq,
 		irte.vector = vector;
 		irte.dest_id = IRTE_DEST(destination);
 
+		/* Set source-id of interrupt request */
+		set_ioapic_sid(&irte, apic_id);
+
 		modify_irte(irq, &irte);
 
 		ir_entry->index2 = (index >> 15) & 0x1;
@@ -3329,6 +3332,9 @@  static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 		irte.vector = cfg->vector;
 		irte.dest_id = IRTE_DEST(dest);
 
+		/* Set source-id of interrupt request */
+		set_msi_sid(&irte, pdev);
+
 		modify_irte(irq, &irte);
 
 		msg->address_hi = MSI_ADDR_BASE_HI;
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 946e170..9ef7b0d 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -10,6 +10,8 @@ 
 #include <linux/intel-iommu.h>
 #include "intr_remapping.h"
 #include <acpi/acpi.h>
+#include <asm/pci-direct.h>
+#include "pci.h"
 
 static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
 static int ir_ioapic_num;
@@ -405,6 +407,61 @@  int free_irte(int irq)
 	return rc;
 }
 
+int set_ioapic_sid(struct irte *irte, int apic)
+{
+	int i;
+	u16 sid = 0;
+
+	if (!irte)
+		return -1;
+
+	for (i = 0; i < MAX_IO_APICS; i++)
+		if (ir_ioapic[i].id == apic) {
+			sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn;
+			break;
+		}
+
+	if (sid == 0) {
+		printk(KERN_WARNING "Failed to set source-id of "
+		       "I/O APIC (%d), because it is not under "
+		       "any DRHD\n", apic);
+		return -1;
+	}
+
+	irte->svt = 1; /* requestor ID verification SID/SQ */
+	irte->sq = 0;  /* comparing all 16-bit of SID */
+	irte->sid = sid;
+
+	return 0;
+}
+
+int set_msi_sid(struct irte *irte, struct pci_dev *dev)
+{
+	struct pci_dev *tmp;
+
+	if (!irte || !dev)
+		return -1;
+
+	tmp = pci_find_upstream_pcie_bridge(dev);
+	if (!tmp) { /* PCIE device or integrated PCI device */
+		irte->svt = 1; /* verify requestor ID verification SID/SQ */
+		irte->sq = 0;  /* comparing all 16-bit of SID */
+		irte->sid = (dev->bus->number << 8) | dev->devfn;
+		return 0;
+	}
+
+	if (tmp->is_pcie) { /* this is a PCIE-to-PCI/PCIX bridge */
+		irte->svt = 2; /* verify request ID verification SID */
+		irte->sid = (tmp->bus->number << 8) | dev->bus->number;
+	} else { /* this is a legacy PCI bridge */
+		irte->svt = 1; /* verify requestor ID verification SID/SQ */
+		irte->sq = 0;  /* comparing all 16-bit of SID */
+		irte->sid = (tmp->bus->number << 8) | tmp->devfn;
+	}
+
+	return 0;
+}
+
 static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
 {
 	u64 addr;
@@ -610,6 +667,35 @@  error:
 	return -1;
 }
 
+static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
+				      struct intel_iommu *iommu)
+{
+	struct acpi_dmar_pci_path *path;
+	u8 bus;
+	int count;
+
+	bus = scope->bus;
+	path = (struct acpi_dmar_pci_path *)(scope + 1);
+	count = (scope->length - sizeof(struct acpi_dmar_device_scope))
+		/ sizeof(struct acpi_dmar_pci_path);
+
+	while (--count > 0) {
+		/*
+		 * Access PCI directly due to the PCI
+		 * subsystem isn't initialized yet.
+		 */
+		bus = read_pci_config_byte(bus, path->dev, path->fn,
+					   PCI_SECONDARY_BUS);
+		path++;
+	}
+
+	ir_ioapic[ir_ioapic_num].bus = bus;
+	ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->dev, path->fn);
+	ir_ioapic[ir_ioapic_num].iommu = iommu;
+	ir_ioapic[ir_ioapic_num].id = scope->enumeration_id;
+	ir_ioapic_num++;
+}
+
 static int ir_parse_ioapic_scope(struct acpi_dmar_header *header,
 				 struct intel_iommu *iommu)
 {
@@ -634,9 +720,7 @@  static int ir_parse_ioapic_scope(struct acpi_dmar_header *header,
 			       " 0x%Lx\n", scope->enumeration_id,
 			       drhd->address);
 
-			ir_ioapic[ir_ioapic_num].iommu = iommu;
-			ir_ioapic[ir_ioapic_num].id = scope->enumeration_id;
-			ir_ioapic_num++;
+			ir_parse_one_ioapic_scope(scope, iommu);
 		}
 		start += scope->length;
 	}
diff --git a/drivers/pci/intr_remapping.h b/drivers/pci/intr_remapping.h
index ca48f0d..dd35780 100644
--- a/drivers/pci/intr_remapping.h
+++ b/drivers/pci/intr_remapping.h
@@ -3,6 +3,8 @@ 
 struct ioapic_scope {
 	struct intel_iommu *iommu;
 	unsigned int id;
+	u8 bus;			/* PCI bus number */
+	u8 devfn;		/* PCI devfn number */
 };
 
 #define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index e397dc3..7d27284 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -125,6 +125,8 @@  extern int free_irte(int irq);
 extern int irq_remapped(int irq);
 extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev);
 extern struct intel_iommu *map_ioapic_to_ir(int apic);
+extern int set_ioapic_sid(struct irte *irte, int apic);
+extern int set_msi_sid(struct irte *irte, struct pci_dev *dev);
 #else
 static inline int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
 {
@@ -155,6 +157,15 @@  static inline struct intel_iommu *map_ioapic_to_ir(int apic)
 {
 	return NULL;
 }
+static inline int set_ioapic_sid(struct irte *irte, int apic)
+{
+	return 0;
+}
+static inline int set_msi_sid(struct irte *irte, struct pci_dev *dev)
+{
+	return 0;
+}
+
 #define irq_remapped(irq)		(0)
 #define enable_intr_remapping(mode)	(-1)
 #define intr_remapping_enabled		(0)