diff mbox

[v10,4/7] PCI: add SR-IOV API for Physical Function driver

Message ID 1235112888-9524-5-git-send-email-yu.zhao@intel.com (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Yu Zhao Feb. 20, 2009, 6:54 a.m. UTC
Signed-off-by: Yu Zhao <yu.zhao@intel.com>
---
 drivers/pci/iov.c   |  348 +++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/pci.h   |    3 +
 include/linux/pci.h |   14 ++
 3 files changed, 365 insertions(+), 0 deletions(-)

Comments

Matthew Wilcox March 6, 2009, 8:37 p.m. UTC | #1
On Fri, Feb 20, 2009 at 02:54:45PM +0800, Yu Zhao wrote:
> +	virtfn->sysdata = dev->bus->sysdata;
> +	virtfn->dev.parent = dev->dev.parent;
> +	virtfn->dev.bus = dev->dev.bus;
> +	virtfn->devfn = devfn;
> +	virtfn->hdr_type = PCI_HEADER_TYPE_NORMAL;
> +	virtfn->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
> +	virtfn->error_state = pci_channel_io_normal;
> +	virtfn->current_state = PCI_UNKNOWN;
> +	virtfn->is_pcie = 1;
> +	virtfn->pcie_type = PCI_EXP_TYPE_ENDPOINT;
> +	virtfn->dma_mask = 0xffffffff;
> +	virtfn->vendor = dev->vendor;
> +	virtfn->subsystem_vendor = dev->subsystem_vendor;
> +	virtfn->class = dev->class;

There seems to be a certain amount of commonality between this and
pci_scan_device().  Have you considered trying to make a common helper
function, or does it not work out well?

> +	pci_device_add(virtfn, virtfn->bus);

Greg is probably going to ding you here for adding the device, then
creating the symlinks.  I believe it's now best practice to create the
symlinks first, so there's no window where userspace can get confused.

> +	mutex_unlock(&iov->pdev->sriov->lock);

I question the existance of this mutex now.  What's it protecting?

Aren't we going to be implicitly protected by virtue of the Physical
Function device driver being the only one calling this function, and the
driver will be calling it from the ->probe routine which is not called
simultaneously for the same device.

> +	virtfn->physfn = pci_dev_get(dev);
> +
> +	rc = pci_bus_add_device(virtfn);
> +	if (rc)
> +		goto failed1;
> +	sprintf(buf, "%d", id);

%u, perhaps?  And maybe 'id' should always be unsigned?  Just a thought.

> +	rc = sysfs_create_link(&iov->dev.kobj, &virtfn->dev.kobj, buf);
> +	if (rc)
> +		goto failed1;
> +	rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn");
> +	if (rc)
> +		goto failed2;

I'm glad to see these symlinks documented in later patches!

> +	nres = 0;
> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> +		res = dev->resource + PCI_SRIOV_RESOURCES + i;
> +		if (!res->parent)
> +			continue;
> +		nres++;
> +	}

Can't this be written more simply as:

	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
		res = dev->resource + PCI_SRIOV_RESOURCES + i;
		if (res->parent)
			nres++;
	}
?

> +	if (nres != iov->nres) {
> +		dev_err(&dev->dev, "no enough MMIO for SR-IOV\n");
> +		return -ENOMEM;
> +	}

Randy, can you help us out with better wording here?

> +		dev_err(&dev->dev, "no enough bus range for SR-IOV\n");

and here.

> +	if (iov->link != dev->devfn) {
> +		rc = -ENODEV;
> +		list_for_each_entry(link, &dev->bus->devices, bus_list) {
> +			if (link->sriov && link->devfn == iov->link)
> +				rc = sysfs_create_link(&iov->dev.kobj,
> +						&link->dev.kobj, "dep_link");

I skipped to the end and read patch 7/7 and I still don't understand
what dep_link is for.  Can you explain please?  In particular, how is it
different from physfn?
Randy Dunlap March 6, 2009, 9:48 p.m. UTC | #2
Matthew Wilcox wrote:
> On Fri, Feb 20, 2009 at 02:54:45PM +0800, Yu Zhao wrote:
> 
>> +	if (nres != iov->nres) {
>> +		dev_err(&dev->dev, "no enough MMIO for SR-IOV\n");
>> +		return -ENOMEM;
>> +	}

		"not enough MMIO BARs for SR-IOV"
	or
		"not enough MMIO resources for SR-IOV"
	or
		"too few MMIO BARs for SR-IOV"
?

> Randy, can you help us out with better wording here?
> 
>> +		dev_err(&dev->dev, "no enough bus range for SR-IOV\n");
> 
> and here.

		"SR-IOV: bus number too large"
	or
		"SR-IOV: bus number out of range"
	or
		"SR-IOV: cannot allocate valid bus number"
?

>> +	if (iov->link != dev->devfn) {
>> +		rc = -ENODEV;
>> +		list_for_each_entry(link, &dev->bus->devices, bus_list) {
>> +			if (link->sriov && link->devfn == iov->link)
>> +				rc = sysfs_create_link(&iov->dev.kobj,
>> +						&link->dev.kobj, "dep_link");

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg KH March 7, 2009, 2:40 a.m. UTC | #3
On Fri, Mar 06, 2009 at 01:37:18PM -0700, Matthew Wilcox wrote:
> On Fri, Feb 20, 2009 at 02:54:45PM +0800, Yu Zhao wrote:
> > +	virtfn->sysdata = dev->bus->sysdata;
> > +	virtfn->dev.parent = dev->dev.parent;
> > +	virtfn->dev.bus = dev->dev.bus;
> > +	virtfn->devfn = devfn;
> > +	virtfn->hdr_type = PCI_HEADER_TYPE_NORMAL;
> > +	virtfn->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
> > +	virtfn->error_state = pci_channel_io_normal;
> > +	virtfn->current_state = PCI_UNKNOWN;
> > +	virtfn->is_pcie = 1;
> > +	virtfn->pcie_type = PCI_EXP_TYPE_ENDPOINT;
> > +	virtfn->dma_mask = 0xffffffff;
> > +	virtfn->vendor = dev->vendor;
> > +	virtfn->subsystem_vendor = dev->subsystem_vendor;
> > +	virtfn->class = dev->class;
> 
> There seems to be a certain amount of commonality between this and
> pci_scan_device().  Have you considered trying to make a common helper
> function, or does it not work out well?
> 
> > +	pci_device_add(virtfn, virtfn->bus);
> 
> Greg is probably going to ding you here for adding the device, then
> creating the symlinks.  I believe it's now best practice to create the
> symlinks first, so there's no window where userspace can get confused.

If the uevent gets sent before the symlinks are created, it's a bug.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yu Zhao March 9, 2009, 8:25 a.m. UTC | #4
On Sat, Mar 07, 2009 at 04:37:18AM +0800, Matthew Wilcox wrote:
> On Fri, Feb 20, 2009 at 02:54:45PM +0800, Yu Zhao wrote:
> > +	virtfn->sysdata = dev->bus->sysdata;
> > +	virtfn->dev.parent = dev->dev.parent;
> > +	virtfn->dev.bus = dev->dev.bus;
> > +	virtfn->devfn = devfn;
> > +	virtfn->hdr_type = PCI_HEADER_TYPE_NORMAL;
> > +	virtfn->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
> > +	virtfn->error_state = pci_channel_io_normal;
> > +	virtfn->current_state = PCI_UNKNOWN;
> > +	virtfn->is_pcie = 1;
> > +	virtfn->pcie_type = PCI_EXP_TYPE_ENDPOINT;
> > +	virtfn->dma_mask = 0xffffffff;
> > +	virtfn->vendor = dev->vendor;
> > +	virtfn->subsystem_vendor = dev->subsystem_vendor;
> > +	virtfn->class = dev->class;
> 
> There seems to be a certain amount of commonality between this and
> pci_scan_device().  Have you considered trying to make a common helper
> function, or does it not work out well?

It's doable. Will enhance the pci_setup_device and use it to setup the VF.

> > +	pci_device_add(virtfn, virtfn->bus);
> 
> Greg is probably going to ding you here for adding the device, then
> creating the symlinks.  I believe it's now best practice to create the
> symlinks first, so there's no window where userspace can get confused.

Yes, but unfortunately we can't create links before adding a device.
I double checked device_add(), there is no place for those links to be
created before it sends uevent. So for now, we have to trigger another
uevent for those links.

> > +	mutex_unlock(&iov->pdev->sriov->lock);
> 
> I question the existance of this mutex now.  What's it protecting?
> 
> Aren't we going to be implicitly protected by virtue of the Physical
> Function device driver being the only one calling this function, and the
> driver will be calling it from the ->probe routine which is not called
> simultaneously for the same device.

The PF driver patches I listed before support dynamical enabling/disabling
of the SR-IOV through sysfs interface. So we have to protect the VF bus
allocation as I explained before.

> > +	virtfn->physfn = pci_dev_get(dev);
> > +
> > +	rc = pci_bus_add_device(virtfn);
> > +	if (rc)
> > +		goto failed1;
> > +	sprintf(buf, "%d", id);
> 
> %u, perhaps?  And maybe 'id' should always be unsigned?  Just a thought.

Yes, will replace %d to %u.

> > +	rc = sysfs_create_link(&iov->dev.kobj, &virtfn->dev.kobj, buf);
> > +	if (rc)
> > +		goto failed1;
> > +	rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn");
> > +	if (rc)
> > +		goto failed2;
> 
> I'm glad to see these symlinks documented in later patches!
> 
> > +	nres = 0;
> > +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> > +		res = dev->resource + PCI_SRIOV_RESOURCES + i;
> > +		if (!res->parent)
> > +			continue;
> > +		nres++;
> > +	}
> 
> Can't this be written more simply as:
> 
> 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> 		res = dev->resource + PCI_SRIOV_RESOURCES + i;
> 		if (res->parent)
> 			nres++;
> 	}

Yes, will do

> ?
> 
> > +	if (nres != iov->nres) {
> > +		dev_err(&dev->dev, "no enough MMIO for SR-IOV\n");
> > +		return -ENOMEM;
> > +	}
> 
> Randy, can you help us out with better wording here?
> 
> > +		dev_err(&dev->dev, "no enough bus range for SR-IOV\n");
> 
> and here.
> 
> > +	if (iov->link != dev->devfn) {
> > +		rc = -ENODEV;
> > +		list_for_each_entry(link, &dev->bus->devices, bus_list) {
> > +			if (link->sriov && link->devfn == iov->link)
> > +				rc = sysfs_create_link(&iov->dev.kobj,
> > +						&link->dev.kobj, "dep_link");
> 
> I skipped to the end and read patch 7/7 and I still don't understand
> what dep_link is for.  Can you explain please?  In particular, how is it
> different from physfn?

It's defined by spec as:

3.3.8. Function Dependency Link (12h)
The programming model for a Device may have vendor specific dependencies
between sets of Functions. The Function Dependency Link field is used to
describe these dependencies. This field describes dependencies between PFs.
VF dependencies are the same as the dependencies of their associated PFs.
If a PF is independent from other PFs of a Device, this field shall
contain its own Function Number. If a PF is dependent on other PFs of a
Device, this field shall contain the Function Number of the next PF in
the same Function Dependency List. The last PF in a Function Dependency
List shall contain the Function Number of the first PF in the Function
Dependency List. If PF p and PF q are in the same Function Dependency
List, than any SI that is assigned VF p,n shall also be assigned to VF q,n.

Thanks,
Yu
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yu Zhao March 9, 2009, 8:29 a.m. UTC | #5
Thanks a lot, Randy!

On Sat, Mar 07, 2009 at 05:48:33AM +0800, Randy Dunlap wrote:
> Matthew Wilcox wrote:
> > On Fri, Feb 20, 2009 at 02:54:45PM +0800, Yu Zhao wrote:
> > 
> >> +	if (nres != iov->nres) {
> >> +		dev_err(&dev->dev, "no enough MMIO for SR-IOV\n");
> >> +		return -ENOMEM;
> >> +	}
> 
> 		"not enough MMIO BARs for SR-IOV"
> 	or
> 		"not enough MMIO resources for SR-IOV"
> 	or
> 		"too few MMIO BARs for SR-IOV"
> ?
> 
> > Randy, can you help us out with better wording here?
> > 
> >> +		dev_err(&dev->dev, "no enough bus range for SR-IOV\n");
> > 
> > and here.
> 
> 		"SR-IOV: bus number too large"
> 	or
> 		"SR-IOV: bus number out of range"
> 	or
> 		"SR-IOV: cannot allocate valid bus number"
> ?
> 
> >> +	if (iov->link != dev->devfn) {
> >> +		rc = -ENODEV;
> >> +		list_for_each_entry(link, &dev->bus->devices, bus_list) {
> >> +			if (link->sriov && link->devfn == iov->link)
> >> +				rc = sysfs_create_link(&iov->dev.kobj,
> >> +						&link->dev.kobj, "dep_link");
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg KH March 9, 2009, 7:39 p.m. UTC | #6
On Mon, Mar 09, 2009 at 04:25:05PM +0800, Yu Zhao wrote:
> > > +	pci_device_add(virtfn, virtfn->bus);
> > 
> > Greg is probably going to ding you here for adding the device, then
> > creating the symlinks.  I believe it's now best practice to create the
> > symlinks first, so there's no window where userspace can get confused.
> 
> Yes, but unfortunately we can't create links before adding a device.
> I double checked device_add(), there is no place for those links to be
> created before it sends uevent. So for now, we have to trigger another
> uevent for those links.

What exactly are you trying to do with a symlink here that you need to
do it this way?  I vaguely remember you mentioning this in the past, but
I thought you had dropped the symlinks after our conversation about this
very problem.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yu Zhao March 10, 2009, 1:37 a.m. UTC | #7
On Tue, Mar 10, 2009 at 03:39:01AM +0800, Greg KH wrote:
> On Mon, Mar 09, 2009 at 04:25:05PM +0800, Yu Zhao wrote:
> > > > +	pci_device_add(virtfn, virtfn->bus);
> > > 
> > > Greg is probably going to ding you here for adding the device, then
> > > creating the symlinks.  I believe it's now best practice to create the
> > > symlinks first, so there's no window where userspace can get confused.
> > 
> > Yes, but unfortunately we can't create links before adding a device.
> > I double checked device_add(), there is no place for those links to be
> > created before it sends uevent. So for now, we have to trigger another
> > uevent for those links.
> 
> What exactly are you trying to do with a symlink here that you need to
> do it this way?  I vaguely remember you mentioning this in the past, but
> I thought you had dropped the symlinks after our conversation about this
> very problem.

I'd like to create some symlinks to reflect the relationship between
Physical Function and its associated Virtual Functions. The Physical
Function is like a master device that controls the allocation of its
Virtual Functions and owns the device physical resource. The Virtual
Functions are like slave devices of the Physical Function. For example,
if 01:00.0 is a Physical Function and 02:00.0 is a Virtual Function
associated with 01:00.0. Then the symlinks (virtfnN and physfn) would
look like:

  $ ls -l /sys/bus/pci/devices/0000:01:00.0/
  ...
  ...  virtfn0 -> ../0000:02:00.0
  ...  virtfn1 -> ../0000:02:00.1
  ...  virtfn2 -> ../0000:02:00.2
  ...

  $ ls -l /sys/bus/pci/devices/0000:02:00.0/
  ...
  ... physfn -> ../0000:01:00.0
  ...

This is very useful for userspace applications, both KVM and Xen need
to know this kind of relationship so they can request the permission
from a Physical Function before using its associated Virtual Functions.

Thanks,
Yu
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg KH March 11, 2009, 4:34 a.m. UTC | #8
On Tue, Mar 10, 2009 at 09:37:53AM +0800, Yu Zhao wrote:
> On Tue, Mar 10, 2009 at 03:39:01AM +0800, Greg KH wrote:
> > On Mon, Mar 09, 2009 at 04:25:05PM +0800, Yu Zhao wrote:
> > > > > +	pci_device_add(virtfn, virtfn->bus);
> > > > 
> > > > Greg is probably going to ding you here for adding the device, then
> > > > creating the symlinks.  I believe it's now best practice to create the
> > > > symlinks first, so there's no window where userspace can get confused.
> > > 
> > > Yes, but unfortunately we can't create links before adding a device.
> > > I double checked device_add(), there is no place for those links to be
> > > created before it sends uevent. So for now, we have to trigger another
> > > uevent for those links.
> > 
> > What exactly are you trying to do with a symlink here that you need to
> > do it this way?  I vaguely remember you mentioning this in the past, but
> > I thought you had dropped the symlinks after our conversation about this
> > very problem.
> 
> I'd like to create some symlinks to reflect the relationship between
> Physical Function and its associated Virtual Functions. The Physical
> Function is like a master device that controls the allocation of its
> Virtual Functions and owns the device physical resource. The Virtual
> Functions are like slave devices of the Physical Function. For example,
> if 01:00.0 is a Physical Function and 02:00.0 is a Virtual Function
> associated with 01:00.0. Then the symlinks (virtfnN and physfn) would
> look like:
> 
>   $ ls -l /sys/bus/pci/devices/0000:01:00.0/
>   ...
>   ...  virtfn0 -> ../0000:02:00.0
>   ...  virtfn1 -> ../0000:02:00.1
>   ...  virtfn2 -> ../0000:02:00.2
>   ...
> 
>   $ ls -l /sys/bus/pci/devices/0000:02:00.0/
>   ...
>   ... physfn -> ../0000:01:00.0
>   ...
> 
> This is very useful for userspace applications, both KVM and Xen need
> to know this kind of relationship so they can request the permission
> from a Physical Function before using its associated Virtual Functions.

Ok, but then make sure you never rely on a udev rule or notifier to see
these symlinks when the device is added to the kernel, as there will be
a nice race condition there :)

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 0b80437..8096fc9 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -13,6 +13,8 @@ 
 #include <linux/delay.h>
 #include "pci.h"
 
+#define VIRTFN_ID_LEN	8
+
 
 static inline void virtfn_bdf(struct pci_dev *dev, int id, u8 *busnr, u8 *devfn)
 {
@@ -24,6 +26,319 @@  static inline void virtfn_bdf(struct pci_dev *dev, int id, u8 *busnr, u8 *devfn)
 	*devfn = bdf & 0xff;
 }
 
+static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr)
+{
+	int rc;
+	struct pci_bus *child;
+
+	if (bus->number == busnr)
+		return bus;
+
+	child = pci_find_bus(pci_domain_nr(bus), busnr);
+	if (child)
+		return child;
+
+	child = pci_add_new_bus(bus, NULL, busnr);
+	if (!child)
+		return NULL;
+
+	child->subordinate = busnr;
+	child->dev.parent = bus->bridge;
+	rc = pci_bus_add_child(child);
+	if (rc) {
+		pci_remove_bus(child);
+		return NULL;
+	}
+
+	return child;
+}
+
+static void virtfn_remove_bus(struct pci_bus *bus, int busnr)
+{
+	struct pci_bus *child;
+
+	if (bus->number == busnr)
+		return;
+
+	child = pci_find_bus(pci_domain_nr(bus), busnr);
+	BUG_ON(!child);
+
+	if (list_empty(&child->devices))
+		pci_remove_bus(child);
+}
+
+static int virtfn_add(struct pci_dev *dev, int id, int reset)
+{
+	int i;
+	int rc;
+	u64 size;
+	u8 busnr, devfn;
+	char buf[VIRTFN_ID_LEN];
+	struct pci_dev *virtfn;
+	struct resource *res;
+	struct pci_sriov *iov = dev->sriov;
+
+	virtfn = alloc_pci_dev();
+	if (!virtfn)
+		return -ENOMEM;
+
+	virtfn_bdf(dev, id, &busnr, &devfn);
+	mutex_lock(&iov->pdev->sriov->lock);
+	virtfn->bus = virtfn_add_bus(dev->bus, busnr);
+	if (!virtfn->bus) {
+		kfree(virtfn);
+		mutex_unlock(&iov->pdev->sriov->lock);
+		return -ENOMEM;
+	}
+
+	virtfn->sysdata = dev->bus->sysdata;
+	virtfn->dev.parent = dev->dev.parent;
+	virtfn->dev.bus = dev->dev.bus;
+	virtfn->devfn = devfn;
+	virtfn->hdr_type = PCI_HEADER_TYPE_NORMAL;
+	virtfn->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
+	virtfn->error_state = pci_channel_io_normal;
+	virtfn->current_state = PCI_UNKNOWN;
+	virtfn->is_pcie = 1;
+	virtfn->pcie_type = PCI_EXP_TYPE_ENDPOINT;
+	virtfn->dma_mask = 0xffffffff;
+	virtfn->vendor = dev->vendor;
+	virtfn->subsystem_vendor = dev->subsystem_vendor;
+	virtfn->class = dev->class;
+	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_DID, &virtfn->device);
+	pci_read_config_byte(virtfn, PCI_REVISION_ID, &virtfn->revision);
+	pci_read_config_word(virtfn, PCI_SUBSYSTEM_ID,
+			     &virtfn->subsystem_device);
+
+	dev_set_name(&virtfn->dev, "%04x:%02x:%02x.%d",
+		     pci_domain_nr(virtfn->bus), busnr,
+		     PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = dev->resource + PCI_SRIOV_RESOURCES + i;
+		if (!res->parent)
+			continue;
+		virtfn->resource[i].name = pci_name(virtfn);
+		virtfn->resource[i].flags = res->flags;
+		size = resource_size(res);
+		do_div(size, iov->total);
+		virtfn->resource[i].start = res->start + size * id;
+		virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
+		rc = request_resource(res, &virtfn->resource[i]);
+		BUG_ON(rc);
+	}
+
+	if (reset)
+		pci_execute_reset_function(virtfn);
+
+	pci_device_add(virtfn, virtfn->bus);
+	mutex_unlock(&iov->pdev->sriov->lock);
+
+	virtfn->physfn = pci_dev_get(dev);
+
+	rc = pci_bus_add_device(virtfn);
+	if (rc)
+		goto failed1;
+	sprintf(buf, "%d", id);
+	rc = sysfs_create_link(&iov->dev.kobj, &virtfn->dev.kobj, buf);
+	if (rc)
+		goto failed1;
+	rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn");
+	if (rc)
+		goto failed2;
+
+	kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE);
+
+	return 0;
+
+failed2:
+	sysfs_remove_link(&iov->dev.kobj, buf);
+failed1:
+	pci_dev_put(dev);
+	mutex_lock(&iov->pdev->sriov->lock);
+	pci_remove_bus_device(virtfn);
+	virtfn_remove_bus(dev->bus, busnr);
+	mutex_unlock(&iov->pdev->sriov->lock);
+
+	return rc;
+}
+
+static void virtfn_remove(struct pci_dev *dev, int id, int reset)
+{
+	u8 busnr, devfn;
+	char buf[VIRTFN_ID_LEN];
+	struct pci_bus *bus;
+	struct pci_dev *virtfn;
+	struct pci_sriov *iov = dev->sriov;
+
+	virtfn_bdf(dev, id, &busnr, &devfn);
+	bus = pci_find_bus(pci_domain_nr(dev->bus), busnr);
+	if (!bus)
+		return;
+
+	virtfn = pci_get_slot(bus, devfn);
+	if (!virtfn)
+		return;
+
+	pci_dev_put(virtfn);
+
+	if (reset) {
+		device_release_driver(&virtfn->dev);
+		pci_execute_reset_function(virtfn);
+	}
+
+	sprintf(buf, "%d", id);
+	sysfs_remove_link(&iov->dev.kobj, buf);
+	sysfs_remove_link(&virtfn->dev.kobj, "physfn");
+
+	mutex_lock(&iov->pdev->sriov->lock);
+	pci_remove_bus_device(virtfn);
+	virtfn_remove_bus(dev->bus, busnr);
+	mutex_unlock(&iov->pdev->sriov->lock);
+
+	pci_dev_put(dev);
+}
+
+static void sriov_release_dev(struct device *dev)
+{
+	struct pci_sriov *iov = container_of(dev, struct pci_sriov, dev);
+
+	iov->nr_virtfn = 0;
+}
+
+static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
+{
+	int rc;
+	int i, j;
+	int nres;
+	u8 busnr, devfn;
+	u16 offset, stride, initial;
+	struct resource *res;
+	struct pci_dev *link;
+	struct pci_sriov *iov = dev->sriov;
+
+	if (!nr_virtfn)
+		return 0;
+
+	if (iov->nr_virtfn)
+		return -EINVAL;
+
+	pci_read_config_word(dev, iov->pos + PCI_SRIOV_INITIAL_VF, &initial);
+	if (initial > iov->total ||
+	    (!(iov->cap & PCI_SRIOV_CAP_VFM) && (initial != iov->total)))
+		return -EIO;
+
+	if (nr_virtfn < 0 || nr_virtfn > iov->total ||
+	    (!(iov->cap & PCI_SRIOV_CAP_VFM) && (nr_virtfn > initial)))
+		return -EINVAL;
+
+	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
+	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &offset);
+	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &stride);
+	if (!offset || (nr_virtfn > 1 && !stride))
+		return -EIO;
+
+	nres = 0;
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = dev->resource + PCI_SRIOV_RESOURCES + i;
+		if (!res->parent)
+			continue;
+		nres++;
+	}
+	if (nres != iov->nres) {
+		dev_err(&dev->dev, "no enough MMIO for SR-IOV\n");
+		return -ENOMEM;
+	}
+
+	iov->offset = offset;
+	iov->stride = stride;
+
+	virtfn_bdf(dev, nr_virtfn - 1, &busnr, &devfn);
+	if (busnr > dev->bus->subordinate) {
+		dev_err(&dev->dev, "no enough bus range for SR-IOV\n");
+		return -ENOMEM;
+	}
+
+	memset(&iov->dev, 0, sizeof(iov->dev));
+	strcpy(iov->dev.bus_id, "virtfn");
+	iov->dev.parent = &dev->dev;
+	iov->dev.release = sriov_release_dev;
+	rc = device_register(&iov->dev);
+	if (rc)
+		return rc;
+
+	if (iov->link != dev->devfn) {
+		rc = -ENODEV;
+		list_for_each_entry(link, &dev->bus->devices, bus_list) {
+			if (link->sriov && link->devfn == iov->link)
+				rc = sysfs_create_link(&iov->dev.kobj,
+						&link->dev.kobj, "dep_link");
+		}
+		if (rc)
+			goto failed1;
+	}
+
+	iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
+	pci_block_user_cfg_access(dev);
+	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+	msleep(100);
+	pci_unblock_user_cfg_access(dev);
+
+	iov->initial = initial;
+	if (nr_virtfn < initial)
+		initial = nr_virtfn;
+
+	for (i = 0; i < initial; i++) {
+		rc = virtfn_add(dev, i, 0);
+		if (rc)
+			goto failed2;
+	}
+
+	kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
+	iov->nr_virtfn = nr_virtfn;
+
+	return 0;
+
+failed2:
+	for (j = 0; j < i; j++)
+		virtfn_remove(dev, j, 0);
+
+	iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
+	pci_block_user_cfg_access(dev);
+	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+	ssleep(1);
+	pci_unblock_user_cfg_access(dev);
+
+	if (iov->link != dev->devfn)
+		sysfs_remove_link(&iov->dev.kobj, "dep_link");
+failed1:
+	device_unregister(&iov->dev);
+
+	return rc;
+}
+
+static void sriov_disable(struct pci_dev *dev)
+{
+	int i;
+	struct pci_sriov *iov = dev->sriov;
+
+	if (!iov->nr_virtfn)
+		return;
+
+	for (i = 0; i < iov->nr_virtfn; i++)
+		virtfn_remove(dev, i, 0);
+
+	iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
+	pci_block_user_cfg_access(dev);
+	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+	ssleep(1);
+	pci_unblock_user_cfg_access(dev);
+
+	if (iov->link != dev->devfn)
+		sysfs_remove_link(&iov->dev.kobj, "dep_link");
+	device_unregister(&iov->dev);
+}
+
 static int sriov_init(struct pci_dev *dev, int pos)
 {
 	int i;
@@ -129,6 +444,8 @@  failed:
 
 static void sriov_release(struct pci_dev *dev)
 {
+	BUG_ON(dev->sriov->nr_virtfn);
+
 	if (dev == dev->sriov->pdev)
 		mutex_destroy(&dev->sriov->lock);
 	else
@@ -152,6 +469,7 @@  static void sriov_restore_state(struct pci_dev *dev)
 		pci_update_resource(dev, i);
 
 	pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
+	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, iov->nr_virtfn);
 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
 	if (iov->ctrl & PCI_SRIOV_CTRL_VFE)
 		msleep(100);
@@ -242,3 +560,33 @@  int pci_iov_bus_range(struct pci_bus *bus)
 
 	return max ? max - bus->number : 0;
 }
+
+/**
+ * pci_enable_sriov - enable the SR-IOV capability
+ * @dev: the PCI device
+ *
+ * Returns 0 on success, or negative on failure.
+ */
+int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
+{
+	might_sleep();
+
+	if (!dev->sriov)
+		return -ENODEV;
+
+	return sriov_enable(dev, nr_virtfn);
+}
+EXPORT_SYMBOL_GPL(pci_enable_sriov);
+
+/**
+ * pci_disable_sriov - disable the SR-IOV capability
+ * @dev: the PCI device
+ */
+void pci_disable_sriov(struct pci_dev *dev)
+{
+	might_sleep();
+
+	if (dev->sriov)
+		sriov_disable(dev);
+}
+EXPORT_SYMBOL_GPL(pci_disable_sriov);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 2cf32f5..9bbf868 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -202,6 +202,8 @@  struct pci_sriov {
 	u32 cap;		/* SR-IOV Capabilities */
 	u16 ctrl;		/* SR-IOV Control */
 	u16 total;		/* total VFs associated with the PF */
+	u16 initial;		/* initial VFs associated with the PF */
+	u16 nr_virtfn;		/* number of VFs available */
 	u16 offset;		/* first VF Routing ID offset */
 	u16 stride;		/* following VF stride */
 	u32 pgsz;		/* page size for BAR alignment */
@@ -209,6 +211,7 @@  struct pci_sriov {
 	struct pci_dev *pdev;	/* lowest numbered PF */
 	struct pci_dev *self;	/* this PF */
 	struct mutex lock;	/* lock for VF bus */
+	struct device dev;
 };
 
 #ifdef CONFIG_PCI_IOV
diff --git a/include/linux/pci.h b/include/linux/pci.h
index f4d740e..3a24ff5 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -278,6 +278,7 @@  struct pci_dev {
 #endif
 	struct pci_vpd *vpd;
 	struct pci_sriov *sriov;	/* SR-IOV capability related */
+	struct pci_dev *physfn;	/* Physical Function the device belongs to */
 };
 
 extern struct pci_dev *alloc_pci_dev(void);
@@ -1202,5 +1203,18 @@  int pci_ext_cfg_avail(struct pci_dev *dev);
 
 void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar);
 
+#ifdef CONFIG_PCI_IOV
+extern int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
+extern void pci_disable_sriov(struct pci_dev *dev);
+#else
+static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
+{
+	return -ENODEV;
+}
+static inline void pci_disable_sriov(struct pci_dev *dev)
+{
+}
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* LINUX_PCI_H */