diff mbox

[RFC,v7] PCI: Set PCI-E Max Payload Size on fabric

Message ID 1311193254-14164-1-git-send-email-mason@myri.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Jon Mason July 20, 2011, 8:20 p.m. UTC
On a given PCI-E fabric, each device, bridge, and root port can have a

Comments

jordan hargrave July 20, 2011, 10:58 p.m. UTC | #1
Looks good, just realized my patch was broken too! :(

On Wed, Jul 20, 2011 at 3:20 PM, Jon Mason <mason@myri.com> wrote:
> On a given PCI-E fabric, each device, bridge, and root port can have a
> different PCI-E maximum payload size.  There is a sizable performance
> boost for having the largest possible maximum payload size on each PCI-E
> device.  However, if improperly configured, fatal bus errors can occur.
> Thus, it is important to ensure that PCI-E payloads sends by a device
> are never larger than the MPS setting of all devices on the way to the
> destination.
>
> This can be achieved two ways:
>
> - A conservative approach is to use the smallest common denominator of
>  the entire tree below a root complex for every device on that fabric.
>
> This means for example that having a 128 bytes MPS USB controller on one
> leg of a switch will dramatically reduce performances of a video card or
> 10GE adapter on another leg of that same switch.
>
> It also means that any hierarchy supporting hotplug slots (including
> expresscard or thunderbolt I suppose, dbl check that) will have to be
> entirely clamped to 128 bytes since we cannot predict what will be
> plugged into those slots, and we cannot change the MPS on a "live"
> system.
>
> - A more optimal way is possible, if it falls within a couple of
>  constraints:
> * The top-level host bridge will never generate packets larger than the
>  smallest TLP (or if it can be controlled independently from its MPS at
>  least)
> * The device will never generate packets larger than MPS (which can be
>  configured via MRRS)
> * No support of direct PCI-E <-> PCI-E transfers between devices without
>  some additional code to specifically deal with that case
>
> Then we can use an approach that basically ignores downstream requests
> and focuses exclusively on upstream requests. In that case, all we need
> to care about is that a device MPS is no larger than its parent MPS,
> which allows us to keep all switches/bridges to the max MPS supported by
> their parent and eventually the PHB.
>
> In this case, your USB controller would no longer "starve" your 10GE
> Ethernet and your hotplug slots won't affect your global MPS.
> Additionally, the hotplugged devices themselves can be configured to a
> larger MPS up to the value configured in the hotplug bridge.
>
> To choose between the two available options, two PCI kernel boot args
> have been added to the PCI calls.  "pcie_bus_safe" will provide the
> former behavior, while "pcie_bus_perf" will perform the latter behavior.
> By default, the latter behavior is used.
>
> NOTE: due to the location of the enablement, each arch will need to add
> calls to this function.  This patch only enables x86.
>
> This patch includes a number of changes recommended by Benjamin
> Herrenschmidt.
>
> Signed-off-by: Jon Mason <mason@myri.com>
> ---
>  arch/x86/pci/acpi.c              |    9 +++
>  drivers/pci/hotplug/pcihp_slot.c |   45 +------------
>  drivers/pci/pci.c                |   67 +++++++++++++++++
>  drivers/pci/probe.c              |  145 ++++++++++++++++++++++++++++++++++++++
>  include/linux/pci.h              |   15 ++++-
>  5 files changed, 236 insertions(+), 45 deletions(-)
>
> diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
> index 0972315..4aeac46 100644
> --- a/arch/x86/pci/acpi.c
> +++ b/arch/x86/pci/acpi.c
> @@ -361,6 +361,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
>                }
>        }
>
> +       /* After the PCI-E bus has been walked and all devices discovered,
> +        * configure any settings of the fabric that might be necessary.
> +        */
> +       if (bus) {
> +               struct pci_bus *child;
> +               list_for_each_entry(child, &bus->children, node)
> +                       pcie_bus_configure_settings(child, child->self->pcie_mpss);
> +       }
> +
>        if (!bus)
>                kfree(sd);
>
> diff --git a/drivers/pci/hotplug/pcihp_slot.c b/drivers/pci/hotplug/pcihp_slot.c
> index 749fdf0..753b21a 100644
> --- a/drivers/pci/hotplug/pcihp_slot.c
> +++ b/drivers/pci/hotplug/pcihp_slot.c
> @@ -158,47 +158,6 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp)
>         */
>  }
>
> -/* Program PCIE MaxPayload setting on device: ensure parent maxpayload <= device */
> -static int pci_set_payload(struct pci_dev *dev)
> -{
> -       int pos, ppos;
> -       u16 pctl, psz;
> -       u16 dctl, dsz, dcap, dmax;
> -       struct pci_dev *parent;
> -
> -       parent = dev->bus->self;
> -       pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
> -       if (!pos)
> -               return 0;
> -
> -       /* Read Device MaxPayload capability and setting */
> -       pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &dctl);
> -       pci_read_config_word(dev, pos + PCI_EXP_DEVCAP, &dcap);
> -       dsz = (dctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5;
> -       dmax = (dcap & PCI_EXP_DEVCAP_PAYLOAD);
> -
> -       /* Read Parent MaxPayload setting */
> -       ppos = pci_find_capability(parent, PCI_CAP_ID_EXP);
> -       if (!ppos)
> -               return 0;
> -       pci_read_config_word(parent, ppos + PCI_EXP_DEVCTL, &pctl);
> -       psz = (pctl &  PCI_EXP_DEVCTL_PAYLOAD) >> 5;
> -
> -       /* If parent payload > device max payload -> error
> -        * If parent payload > device payload -> set speed
> -        * If parent payload <= device payload -> do nothing
> -        */
> -       if (psz > dmax)
> -               return -1;
> -       else if (psz > dsz) {
> -               dev_info(&dev->dev, "Setting MaxPayload to %d\n", 128 << psz);
> -               pci_write_config_word(dev, pos + PCI_EXP_DEVCTL,
> -                                     (dctl & ~PCI_EXP_DEVCTL_PAYLOAD) +
> -                                     (psz << 5));
> -       }
> -       return 0;
> -}
> -
>  void pci_configure_slot(struct pci_dev *dev)
>  {
>        struct pci_dev *cdev;
> @@ -210,9 +169,7 @@ void pci_configure_slot(struct pci_dev *dev)
>                        (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI)))
>                return;
>
> -       ret = pci_set_payload(dev);
> -       if (ret)
> -               dev_warn(&dev->dev, "could not set device max payload\n");
> +       pcie_bus_configure_settings(dev->bus, dev->bus->self->pcie_mpss);
>
>        memset(&hpp, 0, sizeof(hpp));
>        ret = pci_get_hp_params(dev, &hpp);
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 56098b3..4c13d20 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -77,6 +77,8 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE;
>  unsigned long pci_hotplug_io_size  = DEFAULT_HOTPLUG_IO_SIZE;
>  unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE;
>
> +enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PERFORMANCE;
> +
>  /*
>  * The default CLS is used if arch didn't set CLS explicitly and not
>  * all pci devices agree on the same value.  Arch can override either
> @@ -3218,6 +3220,67 @@ out:
>  EXPORT_SYMBOL(pcie_set_readrq);
>
>  /**
> + * pcie_get_mps - get PCI Express maximum payload size
> + * @dev: PCI device to query
> + *
> + * Returns maximum payload size in bytes
> + *    or appropriate error value.
> + */
> +int pcie_get_mps(struct pci_dev *dev)
> +{
> +       int ret, cap;
> +       u16 ctl;
> +
> +       cap = pci_pcie_cap(dev);
> +       if (!cap)
> +               return -EINVAL;
> +
> +       ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
> +       if (!ret)
> +               ret = 128 << ((ctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5);
> +
> +       return ret;
> +}
> +
> +/**
> + * pcie_set_mps - set PCI Express maximum payload size
> + * @dev: PCI device to query
> + * @rq: maximum payload size in bytes
> + *    valid values are 128, 256, 512, 1024, 2048, 4096
> + *
> + * If possible sets maximum payload size
> + */
> +int pcie_set_mps(struct pci_dev *dev, int mps)
> +{
> +       int cap, err = -EINVAL;
> +       u16 ctl, v;
> +
> +       if (mps < 128 || mps > 4096 || !is_power_of_2(mps))
> +               goto out;
> +
> +       v = ffs(mps) - 8;
> +       if (v > dev->pcie_mpss)
> +               goto out;
> +       v <<= 5;
> +
> +       cap = pci_pcie_cap(dev);
> +       if (!cap)
> +               goto out;
> +
> +       err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
> +       if (err)
> +               goto out;
> +
> +       if ((ctl & PCI_EXP_DEVCTL_PAYLOAD) != v) {
> +               ctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
> +               ctl |= v;
> +               err = pci_write_config_word(dev, cap + PCI_EXP_DEVCTL, ctl);
> +       }
> +out:
> +       return err;
> +}
> +
> +/**
>  * pci_select_bars - Make BAR mask from the type of resource
>  * @dev: the PCI device for which BAR mask is made
>  * @flags: resource type mask to be selected
> @@ -3498,6 +3561,10 @@ static int __init pci_setup(char *str)
>                                pci_hotplug_io_size = memparse(str + 9, &str);
>                        } else if (!strncmp(str, "hpmemsize=", 10)) {
>                                pci_hotplug_mem_size = memparse(str + 10, &str);
> +                       } else if (!strncmp(str, "pcie_bus_safe", 13)) {
> +                               pcie_bus_config = PCIE_BUS_SAFE;
> +                       } else if (!strncmp(str, "pcie_bus_perf", 13)) {
> +                               pcie_bus_config = PCIE_BUS_PERFORMANCE;
>                        } else {
>                                printk(KERN_ERR "PCI: Unknown option `%s'\n",
>                                                str);
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index 48849ff..fceca7c 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -860,6 +860,8 @@ void set_pcie_port_type(struct pci_dev *pdev)
>        pdev->pcie_cap = pos;
>        pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
>        pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
> +       pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
> +       pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
>  }
>
>  void set_pcie_hotplug_bridge(struct pci_dev *pdev)
> @@ -1327,6 +1329,149 @@ int pci_scan_slot(struct pci_bus *bus, int devfn)
>        return nr;
>  }
>
> +static int pcie_find_smpss(struct pci_dev *dev, void *data)
> +{
> +       u8 *smpss = data;
> +
> +       if (!pci_is_pcie(dev))
> +               return 0;
> +
> +       /* For PCIE hotplug enabled slots not connected directly to a
> +        * PCI-E root port, there can be problems when hotplugging
> +        * devices.  This is due to the possibility of hotplugging a
> +        * device into the fabric with a smaller MPS that the devices
> +        * currently running have configured.  Modifying the MPS on the
> +        * running devices could cause a fatal bus error due to an
> +        * incoming frame being larger than the newly configured MPS.
> +        * To work around this, the MPS for the entire fabric must be
> +        * set to the minimum size.  Any devices hotplugged into this
> +        * fabric will have the minimum MPS set.  If the PCI hotplug
> +        * slot is directly connected to the root port and there are not
> +        * other devices on the fabric (which seems to be the most
> +        * common case), then this is not an issue and MPS discovery
> +        * will occur as normal.
> +        */
> +       if (dev->is_hotplug_bridge && (!list_is_singular(&dev->bus->devices) ||
> +           dev->bus->self->pcie_type != PCI_EXP_TYPE_ROOT_PORT))
> +               *smpss = 0;
> +
> +       if (*smpss > dev->pcie_mpss)
> +               *smpss = dev->pcie_mpss;
> +
> +       return 0;
> +}
> +
> +static void pcie_write_mps(struct pci_dev *dev, int mps)
> +{
> +       int rc, dev_mpss;
> +
> +       dev_mpss = 128 << dev->pcie_mpss;
> +
> +       if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
> +               if (dev->bus->self) {
> +                       dev_dbg(&dev->bus->dev, "Bus MPSS %d\n",
> +                               128 << dev->bus->self->pcie_mpss);
> +
> +                       /* For "MPS Force Max", the assumption is made that
> +                        * downstream communication will never be larger than
> +                        * the MRRS.  So, the MPS only needs to be configured
> +                        * for the upstream communication.  This being the case,
> +                        * walk from the top down and set the MPS of the child
> +                        * to that of the parent bus.
> +                        */
> +                       mps = 128 << dev->bus->self->pcie_mpss;
> +                       if (mps > dev_mpss)
> +                               dev_warn(&dev->dev, "MPS configured higher than"
> +                                        " maximum supported by the device.  If"
> +                                        " a bus issue occurs, try running with"
> +                                        " pci=pcie_bus_safe.\n");
> +               }
> +
> +               dev->pcie_mpss = ffs(mps) - 8;
> +       }
> +
> +       rc = pcie_set_mps(dev, mps);
> +       if (rc)
> +               dev_err(&dev->dev, "Failed attempting to set the MPS\n");
> +}
> +
> +static void pcie_write_mrrs(struct pci_dev *dev, int mps)
> +{
> +       int rc, mrrs;
> +
> +       if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
> +               int dev_mpss = 128 << dev->pcie_mpss;
> +
> +               /* For Max performance, the MRRS must be set to the largest
> +                * supported value.  However, it cannot be configured larger
> +                * than the MPS the device or the bus can support.  This assumes
> +                * that the largest MRRS available on the device cannot be
> +                * smaller than the device MPSS.
> +                */
> +               mrrs = mps < dev_mpss ? mps : dev_mpss;
> +       } else
> +               /* In the "safe" case, configure the MRRS for fairness on the
> +                * bus by making all devices have the same size
> +                */
> +               mrrs = mps;
> +
> +
> +       /* MRRS is a R/W register.  Invalid values can be written, but a
> +        * subsiquent read will verify if the value is acceptable or not.
> +        * If the MRRS value provided is not acceptable (e.g., too large),
> +        * shrink the value until it is acceptable to the HW.
> +        */
> +       while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) {
> +               rc = pcie_set_readrq(dev, mrrs);
> +               if (rc)
> +                       dev_err(&dev->dev, "Failed attempting to set the MRRS\n");
> +
> +               mrrs /= 2;
> +       }
> +}
> +
> +static int pcie_bus_configure_set(struct pci_dev *dev, void *data)
> +{
> +       int mps = 128 << *(u8 *)data;
> +
> +       if (!pci_is_pcie(dev))
> +               return 0;
> +
> +       dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
> +                pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
> +
> +       pcie_write_mps(dev, mps);
> +       pcie_write_mrrs(dev, mps);
> +
> +       dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
> +                pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
> +
> +       return 0;
> +}
> +
> +/* pcie_bus_configure_mps requires that pci_walk_bus work in a top-down,
> + * parents then children fashion.  If this changes, then this code will not
> + * work as designed.
> + */
> +void pcie_bus_configure_settings(struct pci_bus *bus, u8 mpss)
> +{
> +       u8 smpss = mpss;
> +
> +       if (!bus->self)
> +               return;
> +
> +       if (!pci_is_pcie(bus->self))
> +               return;
> +
> +       if (pcie_bus_config == PCIE_BUS_SAFE) {
> +               pcie_find_smpss(bus->self, &smpss);
> +               pci_walk_bus(bus, pcie_find_smpss, &smpss);
> +       }
> +
> +       pcie_bus_configure_set(bus->self, &smpss);
> +       pci_walk_bus(bus, pcie_bus_configure_set, &smpss);
> +}
> +
>  unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus)
>  {
>        unsigned int devfn, pass, max = bus->secondary;
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index c446b5c..161aa45 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -251,7 +251,8 @@ struct pci_dev {
>        u8              revision;       /* PCI revision, low byte of class word */
>        u8              hdr_type;       /* PCI header type (`multi' flag masked out) */
>        u8              pcie_cap;       /* PCI-E capability offset */
> -       u8              pcie_type;      /* PCI-E device/port type */
> +       u8              pcie_type:4;    /* PCI-E device/port type */
> +       u8              pcie_mpss:3;    /* PCI-E Max Payload Size Supported */
>        u8              rom_base_reg;   /* which config register controls the ROM */
>        u8              pin;            /* which interrupt pin this device uses */
>
> @@ -617,6 +618,16 @@ struct pci_driver {
>  /* these external functions are only available when PCI support is enabled */
>  #ifdef CONFIG_PCI
>
> +extern void pcie_bus_configure_settings(struct pci_bus *bus, u8 smpss);
> +
> +enum pcie_bus_config_types {
> +       PCIE_BUS_PERFORMANCE,
> +       PCIE_BUS_SAFE,
> +       PCIE_BUS_PEER2PEER,
> +};
> +
> +extern enum pcie_bus_config_types pcie_bus_config;
> +
>  extern struct bus_type pci_bus_type;
>
>  /* Do NOT directly access these two variables, unless you are arch specific pci
> @@ -796,6 +807,8 @@ int pcix_get_mmrbc(struct pci_dev *dev);
>  int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc);
>  int pcie_get_readrq(struct pci_dev *dev);
>  int pcie_set_readrq(struct pci_dev *dev, int rq);
> +int pcie_get_mps(struct pci_dev *dev);
> +int pcie_set_mps(struct pci_dev *dev, int mps);
>  int __pci_reset_function(struct pci_dev *dev);
>  int pci_reset_function(struct pci_dev *dev);
>  void pci_update_resource(struct pci_dev *dev, int resno);
> --
> 1.7.6
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jesse Barnes July 22, 2011, 3:16 p.m. UTC | #2
On Wed, 20 Jul 2011 15:20:54 -0500
Jon Mason <mason@myri.com> wrote:

> On a given PCI-E fabric, each device, bridge, and root port can have a
> different PCI-E maximum payload size.  There is a sizable performance
> boost for having the largest possible maximum payload size on each PCI-E
> device.  However, if improperly configured, fatal bus errors can occur.
> Thus, it is important to ensure that PCI-E payloads sends by a device
> are never larger than the MPS setting of all devices on the way to the
> destination.
> 
> This can be achieved two ways:
> 
> - A conservative approach is to use the smallest common denominator of
>   the entire tree below a root complex for every device on that fabric.
> 
> This means for example that having a 128 bytes MPS USB controller on one
> leg of a switch will dramatically reduce performances of a video card or
> 10GE adapter on another leg of that same switch.
> 
> It also means that any hierarchy supporting hotplug slots (including
> expresscard or thunderbolt I suppose, dbl check that) will have to be
> entirely clamped to 128 bytes since we cannot predict what will be
> plugged into those slots, and we cannot change the MPS on a "live"
> system.
> 
> - A more optimal way is possible, if it falls within a couple of
>   constraints:
> * The top-level host bridge will never generate packets larger than the
>   smallest TLP (or if it can be controlled independently from its MPS at
>   least)
> * The device will never generate packets larger than MPS (which can be
>   configured via MRRS)
> * No support of direct PCI-E <-> PCI-E transfers between devices without
>   some additional code to specifically deal with that case
> 
> Then we can use an approach that basically ignores downstream requests
> and focuses exclusively on upstream requests. In that case, all we need
> to care about is that a device MPS is no larger than its parent MPS,
> which allows us to keep all switches/bridges to the max MPS supported by
> their parent and eventually the PHB.
> 
> In this case, your USB controller would no longer "starve" your 10GE
> Ethernet and your hotplug slots won't affect your global MPS.
> Additionally, the hotplugged devices themselves can be configured to a
> larger MPS up to the value configured in the hotplug bridge.
> 
> To choose between the two available options, two PCI kernel boot args
> have been added to the PCI calls.  "pcie_bus_safe" will provide the
> former behavior, while "pcie_bus_perf" will perform the latter behavior.
> By default, the latter behavior is used.
> 
> NOTE: due to the location of the enablement, each arch will need to add
> calls to this function.  This patch only enables x86.
> 
> This patch includes a number of changes recommended by Benjamin
> Herrenschmidt.
> 
> Signed-off-by: Jon Mason <mason@myri.com>

With some tested-bys I think we can still sneak this into 3.1.

Jordon & Ben?

Thanks,
jordan hargrave July 26, 2011, 8:56 p.m. UTC | #3
Tested-by: Jordan_Hargrave@dell.com

On Wed, Jul 20, 2011 at 5:58 PM, jordan hargrave <jharg93@gmail.com> wrote:
> Looks good, just realized my patch was broken too! :(
>
> On Wed, Jul 20, 2011 at 3:20 PM, Jon Mason <mason@myri.com> wrote:
>> On a given PCI-E fabric, each device, bridge, and root port can have a
>> different PCI-E maximum payload size.  There is a sizable performance
>> boost for having the largest possible maximum payload size on each PCI-E
>> device.  However, if improperly configured, fatal bus errors can occur.
>> Thus, it is important to ensure that PCI-E payloads sends by a device
>> are never larger than the MPS setting of all devices on the way to the
>> destination.
>>
>> This can be achieved two ways:
>>
>> - A conservative approach is to use the smallest common denominator of
>>  the entire tree below a root complex for every device on that fabric.
>>
>> This means for example that having a 128 bytes MPS USB controller on one
>> leg of a switch will dramatically reduce performances of a video card or
>> 10GE adapter on another leg of that same switch.
>>
>> It also means that any hierarchy supporting hotplug slots (including
>> expresscard or thunderbolt I suppose, dbl check that) will have to be
>> entirely clamped to 128 bytes since we cannot predict what will be
>> plugged into those slots, and we cannot change the MPS on a "live"
>> system.
>>
>> - A more optimal way is possible, if it falls within a couple of
>>  constraints:
>> * The top-level host bridge will never generate packets larger than the
>>  smallest TLP (or if it can be controlled independently from its MPS at
>>  least)
>> * The device will never generate packets larger than MPS (which can be
>>  configured via MRRS)
>> * No support of direct PCI-E <-> PCI-E transfers between devices without
>>  some additional code to specifically deal with that case
>>
>> Then we can use an approach that basically ignores downstream requests
>> and focuses exclusively on upstream requests. In that case, all we need
>> to care about is that a device MPS is no larger than its parent MPS,
>> which allows us to keep all switches/bridges to the max MPS supported by
>> their parent and eventually the PHB.
>>
>> In this case, your USB controller would no longer "starve" your 10GE
>> Ethernet and your hotplug slots won't affect your global MPS.
>> Additionally, the hotplugged devices themselves can be configured to a
>> larger MPS up to the value configured in the hotplug bridge.
>>
>> To choose between the two available options, two PCI kernel boot args
>> have been added to the PCI calls.  "pcie_bus_safe" will provide the
>> former behavior, while "pcie_bus_perf" will perform the latter behavior.
>> By default, the latter behavior is used.
>>
>> NOTE: due to the location of the enablement, each arch will need to add
>> calls to this function.  This patch only enables x86.
>>
>> This patch includes a number of changes recommended by Benjamin
>> Herrenschmidt.
>>
>> Signed-off-by: Jon Mason <mason@myri.com>
>> ---
>>  arch/x86/pci/acpi.c              |    9 +++
>>  drivers/pci/hotplug/pcihp_slot.c |   45 +------------
>>  drivers/pci/pci.c                |   67 +++++++++++++++++
>>  drivers/pci/probe.c              |  145 ++++++++++++++++++++++++++++++++++++++
>>  include/linux/pci.h              |   15 ++++-
>>  5 files changed, 236 insertions(+), 45 deletions(-)
>>
>> diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
>> index 0972315..4aeac46 100644
>> --- a/arch/x86/pci/acpi.c
>> +++ b/arch/x86/pci/acpi.c
>> @@ -361,6 +361,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
>>                }
>>        }
>>
>> +       /* After the PCI-E bus has been walked and all devices discovered,
>> +        * configure any settings of the fabric that might be necessary.
>> +        */
>> +       if (bus) {
>> +               struct pci_bus *child;
>> +               list_for_each_entry(child, &bus->children, node)
>> +                       pcie_bus_configure_settings(child, child->self->pcie_mpss);
>> +       }
>> +
>>        if (!bus)
>>                kfree(sd);
>>
>> diff --git a/drivers/pci/hotplug/pcihp_slot.c b/drivers/pci/hotplug/pcihp_slot.c
>> index 749fdf0..753b21a 100644
>> --- a/drivers/pci/hotplug/pcihp_slot.c
>> +++ b/drivers/pci/hotplug/pcihp_slot.c
>> @@ -158,47 +158,6 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp)
>>         */
>>  }
>>
>> -/* Program PCIE MaxPayload setting on device: ensure parent maxpayload <= device */
>> -static int pci_set_payload(struct pci_dev *dev)
>> -{
>> -       int pos, ppos;
>> -       u16 pctl, psz;
>> -       u16 dctl, dsz, dcap, dmax;
>> -       struct pci_dev *parent;
>> -
>> -       parent = dev->bus->self;
>> -       pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
>> -       if (!pos)
>> -               return 0;
>> -
>> -       /* Read Device MaxPayload capability and setting */
>> -       pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &dctl);
>> -       pci_read_config_word(dev, pos + PCI_EXP_DEVCAP, &dcap);
>> -       dsz = (dctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5;
>> -       dmax = (dcap & PCI_EXP_DEVCAP_PAYLOAD);
>> -
>> -       /* Read Parent MaxPayload setting */
>> -       ppos = pci_find_capability(parent, PCI_CAP_ID_EXP);
>> -       if (!ppos)
>> -               return 0;
>> -       pci_read_config_word(parent, ppos + PCI_EXP_DEVCTL, &pctl);
>> -       psz = (pctl &  PCI_EXP_DEVCTL_PAYLOAD) >> 5;
>> -
>> -       /* If parent payload > device max payload -> error
>> -        * If parent payload > device payload -> set speed
>> -        * If parent payload <= device payload -> do nothing
>> -        */
>> -       if (psz > dmax)
>> -               return -1;
>> -       else if (psz > dsz) {
>> -               dev_info(&dev->dev, "Setting MaxPayload to %d\n", 128 << psz);
>> -               pci_write_config_word(dev, pos + PCI_EXP_DEVCTL,
>> -                                     (dctl & ~PCI_EXP_DEVCTL_PAYLOAD) +
>> -                                     (psz << 5));
>> -       }
>> -       return 0;
>> -}
>> -
>>  void pci_configure_slot(struct pci_dev *dev)
>>  {
>>        struct pci_dev *cdev;
>> @@ -210,9 +169,7 @@ void pci_configure_slot(struct pci_dev *dev)
>>                        (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI)))
>>                return;
>>
>> -       ret = pci_set_payload(dev);
>> -       if (ret)
>> -               dev_warn(&dev->dev, "could not set device max payload\n");
>> +       pcie_bus_configure_settings(dev->bus, dev->bus->self->pcie_mpss);
>>
>>        memset(&hpp, 0, sizeof(hpp));
>>        ret = pci_get_hp_params(dev, &hpp);
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index 56098b3..4c13d20 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -77,6 +77,8 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE;
>>  unsigned long pci_hotplug_io_size  = DEFAULT_HOTPLUG_IO_SIZE;
>>  unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE;
>>
>> +enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PERFORMANCE;
>> +
>>  /*
>>  * The default CLS is used if arch didn't set CLS explicitly and not
>>  * all pci devices agree on the same value.  Arch can override either
>> @@ -3218,6 +3220,67 @@ out:
>>  EXPORT_SYMBOL(pcie_set_readrq);
>>
>>  /**
>> + * pcie_get_mps - get PCI Express maximum payload size
>> + * @dev: PCI device to query
>> + *
>> + * Returns maximum payload size in bytes
>> + *    or appropriate error value.
>> + */
>> +int pcie_get_mps(struct pci_dev *dev)
>> +{
>> +       int ret, cap;
>> +       u16 ctl;
>> +
>> +       cap = pci_pcie_cap(dev);
>> +       if (!cap)
>> +               return -EINVAL;
>> +
>> +       ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
>> +       if (!ret)
>> +               ret = 128 << ((ctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5);
>> +
>> +       return ret;
>> +}
>> +
>> +/**
>> + * pcie_set_mps - set PCI Express maximum payload size
>> + * @dev: PCI device to query
>> + * @rq: maximum payload size in bytes
>> + *    valid values are 128, 256, 512, 1024, 2048, 4096
>> + *
>> + * If possible sets maximum payload size
>> + */
>> +int pcie_set_mps(struct pci_dev *dev, int mps)
>> +{
>> +       int cap, err = -EINVAL;
>> +       u16 ctl, v;
>> +
>> +       if (mps < 128 || mps > 4096 || !is_power_of_2(mps))
>> +               goto out;
>> +
>> +       v = ffs(mps) - 8;
>> +       if (v > dev->pcie_mpss)
>> +               goto out;
>> +       v <<= 5;
>> +
>> +       cap = pci_pcie_cap(dev);
>> +       if (!cap)
>> +               goto out;
>> +
>> +       err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
>> +       if (err)
>> +               goto out;
>> +
>> +       if ((ctl & PCI_EXP_DEVCTL_PAYLOAD) != v) {
>> +               ctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
>> +               ctl |= v;
>> +               err = pci_write_config_word(dev, cap + PCI_EXP_DEVCTL, ctl);
>> +       }
>> +out:
>> +       return err;
>> +}
>> +
>> +/**
>>  * pci_select_bars - Make BAR mask from the type of resource
>>  * @dev: the PCI device for which BAR mask is made
>>  * @flags: resource type mask to be selected
>> @@ -3498,6 +3561,10 @@ static int __init pci_setup(char *str)
>>                                pci_hotplug_io_size = memparse(str + 9, &str);
>>                        } else if (!strncmp(str, "hpmemsize=", 10)) {
>>                                pci_hotplug_mem_size = memparse(str + 10, &str);
>> +                       } else if (!strncmp(str, "pcie_bus_safe", 13)) {
>> +                               pcie_bus_config = PCIE_BUS_SAFE;
>> +                       } else if (!strncmp(str, "pcie_bus_perf", 13)) {
>> +                               pcie_bus_config = PCIE_BUS_PERFORMANCE;
>>                        } else {
>>                                printk(KERN_ERR "PCI: Unknown option `%s'\n",
>>                                                str);
>> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
>> index 48849ff..fceca7c 100644
>> --- a/drivers/pci/probe.c
>> +++ b/drivers/pci/probe.c
>> @@ -860,6 +860,8 @@ void set_pcie_port_type(struct pci_dev *pdev)
>>        pdev->pcie_cap = pos;
>>        pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
>>        pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
>> +       pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
>> +       pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
>>  }
>>
>>  void set_pcie_hotplug_bridge(struct pci_dev *pdev)
>> @@ -1327,6 +1329,149 @@ int pci_scan_slot(struct pci_bus *bus, int devfn)
>>        return nr;
>>  }
>>
>> +static int pcie_find_smpss(struct pci_dev *dev, void *data)
>> +{
>> +       u8 *smpss = data;
>> +
>> +       if (!pci_is_pcie(dev))
>> +               return 0;
>> +
>> +       /* For PCIE hotplug enabled slots not connected directly to a
>> +        * PCI-E root port, there can be problems when hotplugging
>> +        * devices.  This is due to the possibility of hotplugging a
>> +        * device into the fabric with a smaller MPS that the devices
>> +        * currently running have configured.  Modifying the MPS on the
>> +        * running devices could cause a fatal bus error due to an
>> +        * incoming frame being larger than the newly configured MPS.
>> +        * To work around this, the MPS for the entire fabric must be
>> +        * set to the minimum size.  Any devices hotplugged into this
>> +        * fabric will have the minimum MPS set.  If the PCI hotplug
>> +        * slot is directly connected to the root port and there are not
>> +        * other devices on the fabric (which seems to be the most
>> +        * common case), then this is not an issue and MPS discovery
>> +        * will occur as normal.
>> +        */
>> +       if (dev->is_hotplug_bridge && (!list_is_singular(&dev->bus->devices) ||
>> +           dev->bus->self->pcie_type != PCI_EXP_TYPE_ROOT_PORT))
>> +               *smpss = 0;
>> +
>> +       if (*smpss > dev->pcie_mpss)
>> +               *smpss = dev->pcie_mpss;
>> +
>> +       return 0;
>> +}
>> +
>> +static void pcie_write_mps(struct pci_dev *dev, int mps)
>> +{
>> +       int rc, dev_mpss;
>> +
>> +       dev_mpss = 128 << dev->pcie_mpss;
>> +
>> +       if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
>> +               if (dev->bus->self) {
>> +                       dev_dbg(&dev->bus->dev, "Bus MPSS %d\n",
>> +                               128 << dev->bus->self->pcie_mpss);
>> +
>> +                       /* For "MPS Force Max", the assumption is made that
>> +                        * downstream communication will never be larger than
>> +                        * the MRRS.  So, the MPS only needs to be configured
>> +                        * for the upstream communication.  This being the case,
>> +                        * walk from the top down and set the MPS of the child
>> +                        * to that of the parent bus.
>> +                        */
>> +                       mps = 128 << dev->bus->self->pcie_mpss;
>> +                       if (mps > dev_mpss)
>> +                               dev_warn(&dev->dev, "MPS configured higher than"
>> +                                        " maximum supported by the device.  If"
>> +                                        " a bus issue occurs, try running with"
>> +                                        " pci=pcie_bus_safe.\n");
>> +               }
>> +
>> +               dev->pcie_mpss = ffs(mps) - 8;
>> +       }
>> +
>> +       rc = pcie_set_mps(dev, mps);
>> +       if (rc)
>> +               dev_err(&dev->dev, "Failed attempting to set the MPS\n");
>> +}
>> +
>> +static void pcie_write_mrrs(struct pci_dev *dev, int mps)
>> +{
>> +       int rc, mrrs;
>> +
>> +       if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
>> +               int dev_mpss = 128 << dev->pcie_mpss;
>> +
>> +               /* For Max performance, the MRRS must be set to the largest
>> +                * supported value.  However, it cannot be configured larger
>> +                * than the MPS the device or the bus can support.  This assumes
>> +                * that the largest MRRS available on the device cannot be
>> +                * smaller than the device MPSS.
>> +                */
>> +               mrrs = mps < dev_mpss ? mps : dev_mpss;
>> +       } else
>> +               /* In the "safe" case, configure the MRRS for fairness on the
>> +                * bus by making all devices have the same size
>> +                */
>> +               mrrs = mps;
>> +
>> +
>> +       /* MRRS is a R/W register.  Invalid values can be written, but a
>> +        * subsiquent read will verify if the value is acceptable or not.
>> +        * If the MRRS value provided is not acceptable (e.g., too large),
>> +        * shrink the value until it is acceptable to the HW.
>> +        */
>> +       while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) {
>> +               rc = pcie_set_readrq(dev, mrrs);
>> +               if (rc)
>> +                       dev_err(&dev->dev, "Failed attempting to set the MRRS\n");
>> +
>> +               mrrs /= 2;
>> +       }
>> +}
>> +
>> +static int pcie_bus_configure_set(struct pci_dev *dev, void *data)
>> +{
>> +       int mps = 128 << *(u8 *)data;
>> +
>> +       if (!pci_is_pcie(dev))
>> +               return 0;
>> +
>> +       dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
>> +                pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
>> +
>> +       pcie_write_mps(dev, mps);
>> +       pcie_write_mrrs(dev, mps);
>> +
>> +       dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
>> +                pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
>> +
>> +       return 0;
>> +}
>> +
>> +/* pcie_bus_configure_mps requires that pci_walk_bus work in a top-down,
>> + * parents then children fashion.  If this changes, then this code will not
>> + * work as designed.
>> + */
>> +void pcie_bus_configure_settings(struct pci_bus *bus, u8 mpss)
>> +{
>> +       u8 smpss = mpss;
>> +
>> +       if (!bus->self)
>> +               return;
>> +
>> +       if (!pci_is_pcie(bus->self))
>> +               return;
>> +
>> +       if (pcie_bus_config == PCIE_BUS_SAFE) {
>> +               pcie_find_smpss(bus->self, &smpss);
>> +               pci_walk_bus(bus, pcie_find_smpss, &smpss);
>> +       }
>> +
>> +       pcie_bus_configure_set(bus->self, &smpss);
>> +       pci_walk_bus(bus, pcie_bus_configure_set, &smpss);
>> +}
>> +
>>  unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus)
>>  {
>>        unsigned int devfn, pass, max = bus->secondary;
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index c446b5c..161aa45 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -251,7 +251,8 @@ struct pci_dev {
>>        u8              revision;       /* PCI revision, low byte of class word */
>>        u8              hdr_type;       /* PCI header type (`multi' flag masked out) */
>>        u8              pcie_cap;       /* PCI-E capability offset */
>> -       u8              pcie_type;      /* PCI-E device/port type */
>> +       u8              pcie_type:4;    /* PCI-E device/port type */
>> +       u8              pcie_mpss:3;    /* PCI-E Max Payload Size Supported */
>>        u8              rom_base_reg;   /* which config register controls the ROM */
>>        u8              pin;            /* which interrupt pin this device uses */
>>
>> @@ -617,6 +618,16 @@ struct pci_driver {
>>  /* these external functions are only available when PCI support is enabled */
>>  #ifdef CONFIG_PCI
>>
>> +extern void pcie_bus_configure_settings(struct pci_bus *bus, u8 smpss);
>> +
>> +enum pcie_bus_config_types {
>> +       PCIE_BUS_PERFORMANCE,
>> +       PCIE_BUS_SAFE,
>> +       PCIE_BUS_PEER2PEER,
>> +};
>> +
>> +extern enum pcie_bus_config_types pcie_bus_config;
>> +
>>  extern struct bus_type pci_bus_type;
>>
>>  /* Do NOT directly access these two variables, unless you are arch specific pci
>> @@ -796,6 +807,8 @@ int pcix_get_mmrbc(struct pci_dev *dev);
>>  int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc);
>>  int pcie_get_readrq(struct pci_dev *dev);
>>  int pcie_set_readrq(struct pci_dev *dev, int rq);
>> +int pcie_get_mps(struct pci_dev *dev);
>> +int pcie_set_mps(struct pci_dev *dev, int mps);
>>  int __pci_reset_function(struct pci_dev *dev);
>>  int pci_reset_function(struct pci_dev *dev);
>>  void pci_update_resource(struct pci_dev *dev, int resno);
>> --
>> 1.7.6
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Mason July 26, 2011, 11:15 p.m. UTC | #4
On Fri, Jul 22, 2011 at 10:16 AM, Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> On Wed, 20 Jul 2011 15:20:54 -0500
> Jon Mason <mason@myri.com> wrote:
>
>> On a given PCI-E fabric, each device, bridge, and root port can have a
>> different PCI-E maximum payload size.  There is a sizable performance
>> boost for having the largest possible maximum payload size on each PCI-E
>> device.  However, if improperly configured, fatal bus errors can occur.
>> Thus, it is important to ensure that PCI-E payloads sends by a device
>> are never larger than the MPS setting of all devices on the way to the
>> destination.
>>
>> This can be achieved two ways:
>>
>> - A conservative approach is to use the smallest common denominator of
>>   the entire tree below a root complex for every device on that fabric.
>>
>> This means for example that having a 128 bytes MPS USB controller on one
>> leg of a switch will dramatically reduce performances of a video card or
>> 10GE adapter on another leg of that same switch.
>>
>> It also means that any hierarchy supporting hotplug slots (including
>> expresscard or thunderbolt I suppose, dbl check that) will have to be
>> entirely clamped to 128 bytes since we cannot predict what will be
>> plugged into those slots, and we cannot change the MPS on a "live"
>> system.
>>
>> - A more optimal way is possible, if it falls within a couple of
>>   constraints:
>> * The top-level host bridge will never generate packets larger than the
>>   smallest TLP (or if it can be controlled independently from its MPS at
>>   least)
>> * The device will never generate packets larger than MPS (which can be
>>   configured via MRRS)
>> * No support of direct PCI-E <-> PCI-E transfers between devices without
>>   some additional code to specifically deal with that case
>>
>> Then we can use an approach that basically ignores downstream requests
>> and focuses exclusively on upstream requests. In that case, all we need
>> to care about is that a device MPS is no larger than its parent MPS,
>> which allows us to keep all switches/bridges to the max MPS supported by
>> their parent and eventually the PHB.
>>
>> In this case, your USB controller would no longer "starve" your 10GE
>> Ethernet and your hotplug slots won't affect your global MPS.
>> Additionally, the hotplugged devices themselves can be configured to a
>> larger MPS up to the value configured in the hotplug bridge.
>>
>> To choose between the two available options, two PCI kernel boot args
>> have been added to the PCI calls.  "pcie_bus_safe" will provide the
>> former behavior, while "pcie_bus_perf" will perform the latter behavior.
>> By default, the latter behavior is used.
>>
>> NOTE: due to the location of the enablement, each arch will need to add
>> calls to this function.  This patch only enables x86.
>>
>> This patch includes a number of changes recommended by Benjamin
>> Herrenschmidt.
>>
>> Signed-off-by: Jon Mason <mason@myri.com>
>
> With some tested-bys I think we can still sneak this into 3.1.
>
> Jordon & Ben?

This patch will only work on x86, due to its new enablement location
(per Ben's suggestion).  So, Ben can't really test it on ppc without
adding support for it.  I've tested it on the faulty motherboard I was
seeing the original MPS issue on, and the issue is corrected.  I've
also tested the patch on an Intel motherboard that has only 128B MPS,
and saw no issues.

Thanks,
Jon

>
> Thanks,
> --
> Jesse Barnes, Intel Open Source Technology Center
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jesse Barnes Aug. 1, 2011, 6:52 p.m. UTC | #5
On Wed, 20 Jul 2011 15:20:54 -0500
Jon Mason <mason@myri.com> wrote:

> On a given PCI-E fabric, each device, bridge, and root port can have a
> different PCI-E maximum payload size.  There is a sizable performance
> boost for having the largest possible maximum payload size on each PCI-E
> device.  However, if improperly configured, fatal bus errors can occur.
> Thus, it is important to ensure that PCI-E payloads sends by a device
> are never larger than the MPS setting of all devices on the way to the
> destination.

Applied to my for-linus branch, thanks for being so persistent Jon!
Benjamin Herrenschmidt Sept. 5, 2011, 5:58 p.m. UTC | #6
On Tue, 2011-07-26 at 18:15 -0500, Jon Mason wrote:
> This patch will only work on x86, due to its new enablement location
> (per Ben's suggestion).  So, Ben can't really test it on ppc without
> adding support for it.  I've tested it on the faulty motherboard I was
> seeing the original MPS issue on, and the issue is corrected.  I've
> also tested the patch on an Intel motherboard that has only 128B MPS,
> and saw no issues. 

BTW, I've been meaning to test that for ages and keep getting distracted
and travelling around :-)

I'll probably adapt powerpc to use it in a couple of weeks tho.

Thanks for doing that work Jon !

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

different PCI-E maximum payload size.  There is a sizable performance
boost for having the largest possible maximum payload size on each PCI-E
device.  However, if improperly configured, fatal bus errors can occur.
Thus, it is important to ensure that PCI-E payloads sends by a device
are never larger than the MPS setting of all devices on the way to the
destination.

This can be achieved two ways:

- A conservative approach is to use the smallest common denominator of
  the entire tree below a root complex for every device on that fabric.

This means for example that having a 128 bytes MPS USB controller on one
leg of a switch will dramatically reduce performances of a video card or
10GE adapter on another leg of that same switch.

It also means that any hierarchy supporting hotplug slots (including
expresscard or thunderbolt I suppose, dbl check that) will have to be
entirely clamped to 128 bytes since we cannot predict what will be
plugged into those slots, and we cannot change the MPS on a "live"
system.

- A more optimal way is possible, if it falls within a couple of
  constraints:
* The top-level host bridge will never generate packets larger than the
  smallest TLP (or if it can be controlled independently from its MPS at
  least)
* The device will never generate packets larger than MPS (which can be
  configured via MRRS)
* No support of direct PCI-E <-> PCI-E transfers between devices without
  some additional code to specifically deal with that case

Then we can use an approach that basically ignores downstream requests
and focuses exclusively on upstream requests. In that case, all we need
to care about is that a device MPS is no larger than its parent MPS,
which allows us to keep all switches/bridges to the max MPS supported by
their parent and eventually the PHB.

In this case, your USB controller would no longer "starve" your 10GE
Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a
larger MPS up to the value configured in the hotplug bridge.

To choose between the two available options, two PCI kernel boot args
have been added to the PCI calls.  "pcie_bus_safe" will provide the
former behavior, while "pcie_bus_perf" will perform the latter behavior.
By default, the latter behavior is used.

NOTE: due to the location of the enablement, each arch will need to add
calls to this function.  This patch only enables x86.

This patch includes a number of changes recommended by Benjamin
Herrenschmidt.

Signed-off-by: Jon Mason <mason@myri.com>
---
 arch/x86/pci/acpi.c              |    9 +++
 drivers/pci/hotplug/pcihp_slot.c |   45 +------------
 drivers/pci/pci.c                |   67 +++++++++++++++++
 drivers/pci/probe.c              |  145 ++++++++++++++++++++++++++++++++++++++
 include/linux/pci.h              |   15 ++++-
 5 files changed, 236 insertions(+), 45 deletions(-)

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 0972315..4aeac46 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -361,6 +361,15 @@  struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 		}
 	}
 
+	/* After the PCI-E bus has been walked and all devices discovered,
+	 * configure any settings of the fabric that might be necessary.
+	 */
+	if (bus) {
+		struct pci_bus *child;
+		list_for_each_entry(child, &bus->children, node)
+			pcie_bus_configure_settings(child, child->self->pcie_mpss);
+	}
+
 	if (!bus)
 		kfree(sd);
 
diff --git a/drivers/pci/hotplug/pcihp_slot.c b/drivers/pci/hotplug/pcihp_slot.c
index 749fdf0..753b21a 100644
--- a/drivers/pci/hotplug/pcihp_slot.c
+++ b/drivers/pci/hotplug/pcihp_slot.c
@@ -158,47 +158,6 @@  static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp)
 	 */
 }
 
-/* Program PCIE MaxPayload setting on device: ensure parent maxpayload <= device */
-static int pci_set_payload(struct pci_dev *dev)
-{
-       int pos, ppos;
-       u16 pctl, psz;
-       u16 dctl, dsz, dcap, dmax;
-       struct pci_dev *parent;
-
-       parent = dev->bus->self;
-       pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
-       if (!pos)
-               return 0;
-
-       /* Read Device MaxPayload capability and setting */
-       pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &dctl);
-       pci_read_config_word(dev, pos + PCI_EXP_DEVCAP, &dcap);
-       dsz = (dctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5;
-       dmax = (dcap & PCI_EXP_DEVCAP_PAYLOAD);
-
-       /* Read Parent MaxPayload setting */
-       ppos = pci_find_capability(parent, PCI_CAP_ID_EXP);
-       if (!ppos)
-               return 0;
-       pci_read_config_word(parent, ppos + PCI_EXP_DEVCTL, &pctl);
-       psz = (pctl &  PCI_EXP_DEVCTL_PAYLOAD) >> 5;
-
-       /* If parent payload > device max payload -> error
-        * If parent payload > device payload -> set speed
-        * If parent payload <= device payload -> do nothing
-        */
-       if (psz > dmax)
-               return -1;
-       else if (psz > dsz) {
-               dev_info(&dev->dev, "Setting MaxPayload to %d\n", 128 << psz);
-               pci_write_config_word(dev, pos + PCI_EXP_DEVCTL,
-                                     (dctl & ~PCI_EXP_DEVCTL_PAYLOAD) +
-                                     (psz << 5));
-       }
-       return 0;
-}
-
 void pci_configure_slot(struct pci_dev *dev)
 {
 	struct pci_dev *cdev;
@@ -210,9 +169,7 @@  void pci_configure_slot(struct pci_dev *dev)
 			(dev->class >> 8) == PCI_CLASS_BRIDGE_PCI)))
 		return;
 
-       ret = pci_set_payload(dev);
-       if (ret)
-               dev_warn(&dev->dev, "could not set device max payload\n");
+	pcie_bus_configure_settings(dev->bus, dev->bus->self->pcie_mpss);
 
 	memset(&hpp, 0, sizeof(hpp));
 	ret = pci_get_hp_params(dev, &hpp);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 56098b3..4c13d20 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -77,6 +77,8 @@  unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE;
 unsigned long pci_hotplug_io_size  = DEFAULT_HOTPLUG_IO_SIZE;
 unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE;
 
+enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PERFORMANCE;
+
 /*
  * The default CLS is used if arch didn't set CLS explicitly and not
  * all pci devices agree on the same value.  Arch can override either
@@ -3218,6 +3220,67 @@  out:
 EXPORT_SYMBOL(pcie_set_readrq);
 
 /**
+ * pcie_get_mps - get PCI Express maximum payload size
+ * @dev: PCI device to query
+ *
+ * Returns maximum payload size in bytes
+ *    or appropriate error value.
+ */
+int pcie_get_mps(struct pci_dev *dev)
+{
+	int ret, cap;
+	u16 ctl;
+
+	cap = pci_pcie_cap(dev);
+	if (!cap)
+		return -EINVAL;
+
+	ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
+	if (!ret)
+		ret = 128 << ((ctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5);
+
+	return ret;
+}
+
+/**
+ * pcie_set_mps - set PCI Express maximum payload size
+ * @dev: PCI device to query
+ * @rq: maximum payload size in bytes
+ *    valid values are 128, 256, 512, 1024, 2048, 4096
+ *
+ * If possible sets maximum payload size
+ */
+int pcie_set_mps(struct pci_dev *dev, int mps)
+{
+	int cap, err = -EINVAL;
+	u16 ctl, v;
+
+	if (mps < 128 || mps > 4096 || !is_power_of_2(mps))
+		goto out;
+
+	v = ffs(mps) - 8;
+	if (v > dev->pcie_mpss) 
+		goto out;
+	v <<= 5;
+
+	cap = pci_pcie_cap(dev);
+	if (!cap)
+		goto out;
+
+	err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
+	if (err)
+		goto out;
+
+	if ((ctl & PCI_EXP_DEVCTL_PAYLOAD) != v) {
+		ctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
+		ctl |= v;
+		err = pci_write_config_word(dev, cap + PCI_EXP_DEVCTL, ctl);
+	}
+out:
+	return err;
+}
+
+/**
  * pci_select_bars - Make BAR mask from the type of resource
  * @dev: the PCI device for which BAR mask is made
  * @flags: resource type mask to be selected
@@ -3498,6 +3561,10 @@  static int __init pci_setup(char *str)
 				pci_hotplug_io_size = memparse(str + 9, &str);
 			} else if (!strncmp(str, "hpmemsize=", 10)) {
 				pci_hotplug_mem_size = memparse(str + 10, &str);
+			} else if (!strncmp(str, "pcie_bus_safe", 13)) {
+				pcie_bus_config = PCIE_BUS_SAFE;
+			} else if (!strncmp(str, "pcie_bus_perf", 13)) {
+				pcie_bus_config = PCIE_BUS_PERFORMANCE;
 			} else {
 				printk(KERN_ERR "PCI: Unknown option `%s'\n",
 						str);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 48849ff..fceca7c 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -860,6 +860,8 @@  void set_pcie_port_type(struct pci_dev *pdev)
 	pdev->pcie_cap = pos;
 	pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
 	pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
+	pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
+	pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
 }
 
 void set_pcie_hotplug_bridge(struct pci_dev *pdev)
@@ -1327,6 +1329,149 @@  int pci_scan_slot(struct pci_bus *bus, int devfn)
 	return nr;
 }
 
+static int pcie_find_smpss(struct pci_dev *dev, void *data)
+{
+	u8 *smpss = data;
+
+	if (!pci_is_pcie(dev))
+		return 0;
+
+	/* For PCIE hotplug enabled slots not connected directly to a
+	 * PCI-E root port, there can be problems when hotplugging
+	 * devices.  This is due to the possibility of hotplugging a
+	 * device into the fabric with a smaller MPS that the devices
+	 * currently running have configured.  Modifying the MPS on the
+	 * running devices could cause a fatal bus error due to an
+	 * incoming frame being larger than the newly configured MPS.
+	 * To work around this, the MPS for the entire fabric must be
+	 * set to the minimum size.  Any devices hotplugged into this
+	 * fabric will have the minimum MPS set.  If the PCI hotplug
+	 * slot is directly connected to the root port and there are not
+	 * other devices on the fabric (which seems to be the most
+	 * common case), then this is not an issue and MPS discovery
+	 * will occur as normal.
+	 */
+	if (dev->is_hotplug_bridge && (!list_is_singular(&dev->bus->devices) ||
+	    dev->bus->self->pcie_type != PCI_EXP_TYPE_ROOT_PORT))
+		*smpss = 0;
+
+	if (*smpss > dev->pcie_mpss)
+		*smpss = dev->pcie_mpss;
+
+	return 0;
+}
+
+static void pcie_write_mps(struct pci_dev *dev, int mps)
+{
+	int rc, dev_mpss;
+
+	dev_mpss = 128 << dev->pcie_mpss;
+
+	if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
+		if (dev->bus->self) {
+			dev_dbg(&dev->bus->dev, "Bus MPSS %d\n",
+				128 << dev->bus->self->pcie_mpss);
+
+			/* For "MPS Force Max", the assumption is made that
+			 * downstream communication will never be larger than
+			 * the MRRS.  So, the MPS only needs to be configured
+			 * for the upstream communication.  This being the case,
+			 * walk from the top down and set the MPS of the child
+			 * to that of the parent bus.
+			 */
+			mps = 128 << dev->bus->self->pcie_mpss;
+			if (mps > dev_mpss)
+				dev_warn(&dev->dev, "MPS configured higher than"
+					 " maximum supported by the device.  If"
+					 " a bus issue occurs, try running with"
+					 " pci=pcie_bus_safe.\n");
+		}
+
+		dev->pcie_mpss = ffs(mps) - 8;
+	}
+
+	rc = pcie_set_mps(dev, mps);
+	if (rc)
+		dev_err(&dev->dev, "Failed attempting to set the MPS\n");
+}
+
+static void pcie_write_mrrs(struct pci_dev *dev, int mps)
+{
+	int rc, mrrs;
+
+	if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
+		int dev_mpss = 128 << dev->pcie_mpss;
+
+		/* For Max performance, the MRRS must be set to the largest
+		 * supported value.  However, it cannot be configured larger
+		 * than the MPS the device or the bus can support.  This assumes
+		 * that the largest MRRS available on the device cannot be
+		 * smaller than the device MPSS.
+		 */
+		mrrs = mps < dev_mpss ? mps : dev_mpss;
+	} else
+		/* In the "safe" case, configure the MRRS for fairness on the
+		 * bus by making all devices have the same size
+		 */
+		mrrs = mps;
+
+
+	/* MRRS is a R/W register.  Invalid values can be written, but a
+	 * subsiquent read will verify if the value is acceptable or not.
+	 * If the MRRS value provided is not acceptable (e.g., too large),
+	 * shrink the value until it is acceptable to the HW.
+ 	 */
+	while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) {
+		rc = pcie_set_readrq(dev, mrrs);
+		if (rc)
+			dev_err(&dev->dev, "Failed attempting to set the MRRS\n");
+
+		mrrs /= 2;
+	}
+}
+
+static int pcie_bus_configure_set(struct pci_dev *dev, void *data)
+{
+	int mps = 128 << *(u8 *)data;
+
+	if (!pci_is_pcie(dev))
+		return 0;
+
+	dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
+		 pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
+
+	pcie_write_mps(dev, mps);
+	pcie_write_mrrs(dev, mps);
+
+	dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
+		 pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
+
+	return 0;
+}
+
+/* pcie_bus_configure_mps requires that pci_walk_bus work in a top-down,
+ * parents then children fashion.  If this changes, then this code will not
+ * work as designed.
+ */
+void pcie_bus_configure_settings(struct pci_bus *bus, u8 mpss)
+{
+	u8 smpss = mpss;
+
+	if (!bus->self)
+		return;
+
+	if (!pci_is_pcie(bus->self))
+		return;
+
+	if (pcie_bus_config == PCIE_BUS_SAFE) {
+		pcie_find_smpss(bus->self, &smpss);
+		pci_walk_bus(bus, pcie_find_smpss, &smpss);
+	}
+
+	pcie_bus_configure_set(bus->self, &smpss);
+	pci_walk_bus(bus, pcie_bus_configure_set, &smpss);
+}
+
 unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus)
 {
 	unsigned int devfn, pass, max = bus->secondary;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c446b5c..161aa45 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -251,7 +251,8 @@  struct pci_dev {
 	u8		revision;	/* PCI revision, low byte of class word */
 	u8		hdr_type;	/* PCI header type (`multi' flag masked out) */
 	u8		pcie_cap;	/* PCI-E capability offset */
-	u8		pcie_type;	/* PCI-E device/port type */
+	u8		pcie_type:4;	/* PCI-E device/port type */
+	u8		pcie_mpss:3;	/* PCI-E Max Payload Size Supported */
 	u8		rom_base_reg;	/* which config register controls the ROM */
 	u8		pin;  		/* which interrupt pin this device uses */
 
@@ -617,6 +618,16 @@  struct pci_driver {
 /* these external functions are only available when PCI support is enabled */
 #ifdef CONFIG_PCI
 
+extern void pcie_bus_configure_settings(struct pci_bus *bus, u8 smpss);
+
+enum pcie_bus_config_types {
+	PCIE_BUS_PERFORMANCE,
+	PCIE_BUS_SAFE,
+	PCIE_BUS_PEER2PEER,
+};
+
+extern enum pcie_bus_config_types pcie_bus_config;
+
 extern struct bus_type pci_bus_type;
 
 /* Do NOT directly access these two variables, unless you are arch specific pci
@@ -796,6 +807,8 @@  int pcix_get_mmrbc(struct pci_dev *dev);
 int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc);
 int pcie_get_readrq(struct pci_dev *dev);
 int pcie_set_readrq(struct pci_dev *dev, int rq);
+int pcie_get_mps(struct pci_dev *dev);
+int pcie_set_mps(struct pci_dev *dev, int mps);
 int __pci_reset_function(struct pci_dev *dev);
 int pci_reset_function(struct pci_dev *dev);
 void pci_update_resource(struct pci_dev *dev, int resno);