diff mbox series

[1/2] PCI/DPC: Add 'nodpc' parameter

Message ID 1534368400-2807-1-git-send-email-jonathan.derrick@intel.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show
Series [1/2] PCI/DPC: Add 'nodpc' parameter | expand

Commit Message

Jon Derrick Aug. 15, 2018, 9:26 p.m. UTC
Some users may want to disable downstream port containment (DPC), so
give them this option

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
---
 drivers/pci/pci.c      |  2 ++
 drivers/pci/pci.h      |  6 ++++++
 drivers/pci/pcie/dpc.c | 48 ++++++++++++++++++++++++++++++++++++++----------
 include/linux/pci.h    |  6 ++++++
 4 files changed, 52 insertions(+), 10 deletions(-)

Comments

Keith Busch Aug. 15, 2018, 11:03 p.m. UTC | #1
On Wed, Aug 15, 2018 at 03:26:39PM -0600, Jon Derrick wrote:
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -536,4 +536,10 @@ static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
>  static inline void pci_aer_clear_device_status(struct pci_dev *dev) { }
>  #endif
>  
> +#ifdef CONFIG_PCIE_DPC
> +void pci_no_dpc(void);
> +#else
> +static inline void pci_no_dpc(void) { }
> +#endif

<snip>

> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1473,6 +1473,12 @@ static inline int pci_irqd_intx_xlate(struct irq_domain *d,
>  static inline bool pci_aer_available(void) { return false; }
>  #endif
>  
> +#ifdef CONFIG_PCIE_DPC
> +bool pci_dpc_available(void);
> +#else
> +static inline bool pci_dpc_available(void) { return false; }
> +#endif

Seems like these two sections belong together. Otherwise, looks fine.
Oza Pawandeep Aug. 16, 2018, 9:27 a.m. UTC | #2
On 2018-08-16 02:56, Jon Derrick wrote:
> Some users may want to disable downstream port containment (DPC), so
> give them this option
> 
> Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
> ---
>  drivers/pci/pci.c      |  2 ++
>  drivers/pci/pci.h      |  6 ++++++
>  drivers/pci/pcie/dpc.c | 48 
> ++++++++++++++++++++++++++++++++++++++----------
>  include/linux/pci.h    |  6 ++++++
>  4 files changed, 52 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 80da484..4e629c2 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -6062,6 +6062,8 @@ static int __init pci_setup(char *str)
>  				pcie_ats_disabled = true;
>  			} else if (!strcmp(str, "noaer")) {
>  				pci_no_aer();
> +			} else if (!strcmp(str, "nodpc")) {
> +				pci_no_dpc();
>  			} else if (!strcmp(str, "earlydump")) {
>  				pci_early_dump = true;
>  			} else if (!strncmp(str, "realloc=", 8)) {
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 6e0d152..f73f29e 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -536,4 +536,10 @@ static inline void
> pci_aer_clear_fatal_status(struct pci_dev *dev) { }
>  static inline void pci_aer_clear_device_status(struct pci_dev *dev) { 
> }
>  #endif
> 
> +#ifdef CONFIG_PCIE_DPC
> +void pci_no_dpc(void);
> +#else
> +static inline void pci_no_dpc(void) { }
> +#endif
> +
>  #endif /* DRIVERS_PCI_H */
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index f03279f..068fca0 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -44,6 +44,18 @@ struct dpc_dev {
>  	"Memory Request Completion Timeout",		 /* Bit Position 18 */
>  };
> 
> +static int pcie_dpc_disable;
> +
> +void pci_no_dpc(void)
> +{
> +	pcie_dpc_disable = 1;
> +}
> +
> +bool pci_dpc_available(void)
> +{
> +	return !pcie_dpc_disable && pci_aer_available() && pci_msi_enabled();

1) why do you check pci_aer_available() ?
2) and pci_aer_available() already internally checks pci_msi_enabled();

> +}
> +
>  static int dpc_wait_rp_inactive(struct dpc_dev *dpc)
>  {
>  	unsigned long timeout = jiffies + HZ;
> @@ -209,6 +221,17 @@ static irqreturn_t dpc_irq(int irq, void *context)
>  	return IRQ_HANDLED;
>  }
> 
> +static void dpc_disable(struct pci_dev *pdev)
> +{
> +	u16 cap_pos, ctl;
> +
> +	cap_pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DPC);
> +	pci_read_config_word(pdev, cap_pos + PCI_EXP_DPC_CTL, &ctl);
> +	ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_EN_NONFATAL |
> +		 PCI_EXP_DPC_CTL_INT_EN);
> +	pci_write_config_word(pdev, cap_pos + PCI_EXP_DPC_CTL, ctl);
> +}
> +
>  #define FLAG(x, y) (((x) & (y)) ? '+' : '-')
>  static int dpc_probe(struct pcie_device *dev)
>  {
> @@ -221,9 +244,16 @@ static int dpc_probe(struct pcie_device *dev)
>  	if (pcie_aer_get_firmware_first(pdev))
>  		return -ENOTSUPP;
> 
> +	if (!pci_dpc_available()) {
> +		status = -ENOTSUPP;
> +		goto disable_dpc;
> +	}
> +
>  	dpc = devm_kzalloc(device, sizeof(*dpc), GFP_KERNEL);
> -	if (!dpc)
> -		return -ENOMEM;
> +	if (!dpc) {
> +		status = -ENOMEM;
> +		goto disable_dpc;
> +	}
> 
>  	dpc->cap_pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DPC);
>  	dpc->dev = dev;
> @@ -235,7 +265,7 @@ static int dpc_probe(struct pcie_device *dev)
>  	if (status) {
>  		dev_warn(device, "request IRQ%d failed: %d\n", dev->irq,
>  			 status);
> -		return status;
> +		goto disable_dpc;
>  	}
> 
>  	pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CAP, &cap);
> @@ -260,17 +290,15 @@ static int dpc_probe(struct pcie_device *dev)
>  		FLAG(cap, PCI_EXP_DPC_CAP_SW_TRIGGER), dpc->rp_log_size,
>  		FLAG(cap, PCI_EXP_DPC_CAP_DL_ACTIVE));
>  	return status;
> +
> +disable_dpc:
> +	dpc_disable(pdev);
> +	return status;
>  }
> 
>  static void dpc_remove(struct pcie_device *dev)
>  {
> -	struct dpc_dev *dpc = get_service_data(dev);
> -	struct pci_dev *pdev = dev->port;
> -	u16 ctl;
> -
> -	pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl);
> -	ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
> -	pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
> +	dpc_disable(dev->port);
>  }
> 
>  static struct pcie_port_service_driver dpcdriver = {
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 5454e6b..559b792 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1473,6 +1473,12 @@ static inline int pci_irqd_intx_xlate(struct
> irq_domain *d,
>  static inline bool pci_aer_available(void) { return false; }
>  #endif
> 
> +#ifdef CONFIG_PCIE_DPC
> +bool pci_dpc_available(void);
> +#else
> +static inline bool pci_dpc_available(void) { return false; }
> +#endif
> +
>  #ifdef CONFIG_PCIE_ECRC
>  void pcie_set_ecrc_checking(struct pci_dev *dev);
>  void pcie_ecrc_get_policy(char *str);
Jon Derrick Aug. 16, 2018, 3:45 p.m. UTC | #3
On Thu, 2018-08-16 at 14:57 +0530, poza@codeaurora.org wrote:
> On 2018-08-16 02:56, Jon Derrick wrote:
> > Some users may want to disable downstream port containment (DPC),
> > so
> > give them this option
> > 
> > Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
> > ---
> >  drivers/pci/pci.c      |  2 ++
> >  drivers/pci/pci.h      |  6 ++++++
> >  drivers/pci/pcie/dpc.c | 48 
> > ++++++++++++++++++++++++++++++++++++++----------
> >  include/linux/pci.h    |  6 ++++++
> >  4 files changed, 52 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > index 80da484..4e629c2 100644
> > --- a/drivers/pci/pci.c
> > +++ b/drivers/pci/pci.c
> > @@ -6062,6 +6062,8 @@ static int __init pci_setup(char *str)
> >  				pcie_ats_disabled = true;
> >  			} else if (!strcmp(str, "noaer")) {
> >  				pci_no_aer();
> > +			} else if (!strcmp(str, "nodpc")) {
> > +				pci_no_dpc();
> >  			} else if (!strcmp(str, "earlydump")) {
> >  				pci_early_dump = true;
> >  			} else if (!strncmp(str, "realloc=", 8)) {
> > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> > index 6e0d152..f73f29e 100644
> > --- a/drivers/pci/pci.h
> > +++ b/drivers/pci/pci.h
> > @@ -536,4 +536,10 @@ static inline void
> > pci_aer_clear_fatal_status(struct pci_dev *dev) { }
> >  static inline void pci_aer_clear_device_status(struct pci_dev
> > *dev) { 
> > }
> >  #endif
> > 
> > +#ifdef CONFIG_PCIE_DPC
> > +void pci_no_dpc(void);
> > +#else
> > +static inline void pci_no_dpc(void) { }
> > +#endif
> > +
> >  #endif /* DRIVERS_PCI_H */
> > diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> > index f03279f..068fca0 100644
> > --- a/drivers/pci/pcie/dpc.c
> > +++ b/drivers/pci/pcie/dpc.c
> > @@ -44,6 +44,18 @@ struct dpc_dev {
> >  	"Memory Request Completion Timeout",		 /*
> > Bit Position 18 */
> >  };
> > 
> > +static int pcie_dpc_disable;
> > +
> > +void pci_no_dpc(void)
> > +{
> > +	pcie_dpc_disable = 1;
> > +}
> > +
> > +bool pci_dpc_available(void)
> > +{
> > +	return !pcie_dpc_disable && pci_aer_available() &&
> > pci_msi_enabled();
> 
> 1) why do you check pci_aer_available() ?
I'd assumed a dependency on aer, but looking at it again I see it calls
into err.c without a requirement on aer.

Then it also seems the pci_aer_available() and AER dependency check in
portdrv_core.c is unneccessary too.

> 2) and pci_aer_available() already internally checks
> pci_msi_enabled();
> 
> > +}
> > +
> >  static int dpc_wait_rp_inactive(struct dpc_dev *dpc)
> >  {
> >  	unsigned long timeout = jiffies + HZ;
> > @@ -209,6 +221,17 @@ static irqreturn_t dpc_irq(int irq, void
> > *context)
> >  	return IRQ_HANDLED;
> >  }
> > 
> > +static void dpc_disable(struct pci_dev *pdev)
> > +{
> > +	u16 cap_pos, ctl;
> > +
> > +	cap_pos = pci_find_ext_capability(pdev,
> > PCI_EXT_CAP_ID_DPC);
> > +	pci_read_config_word(pdev, cap_pos + PCI_EXP_DPC_CTL,
> > &ctl);
> > +	ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL |
> > PCI_EXP_DPC_CTL_EN_NONFATAL |
> > +		 PCI_EXP_DPC_CTL_INT_EN);
> > +	pci_write_config_word(pdev, cap_pos + PCI_EXP_DPC_CTL,
> > ctl);
> > +}
> > +
> >  #define FLAG(x, y) (((x) & (y)) ? '+' : '-')
> >  static int dpc_probe(struct pcie_device *dev)
> >  {
> > @@ -221,9 +244,16 @@ static int dpc_probe(struct pcie_device *dev)
> >  	if (pcie_aer_get_firmware_first(pdev))
> >  		return -ENOTSUPP;
> > 
> > +	if (!pci_dpc_available()) {
> > +		status = -ENOTSUPP;
> > +		goto disable_dpc;
> > +	}
> > +
> >  	dpc = devm_kzalloc(device, sizeof(*dpc), GFP_KERNEL);
> > -	if (!dpc)
> > -		return -ENOMEM;
> > +	if (!dpc) {
> > +		status = -ENOMEM;
> > +		goto disable_dpc;
> > +	}
> > 
> >  	dpc->cap_pos = pci_find_ext_capability(pdev,
> > PCI_EXT_CAP_ID_DPC);
> >  	dpc->dev = dev;
> > @@ -235,7 +265,7 @@ static int dpc_probe(struct pcie_device *dev)
> >  	if (status) {
> >  		dev_warn(device, "request IRQ%d failed: %d\n",
> > dev->irq,
> >  			 status);
> > -		return status;
> > +		goto disable_dpc;
> >  	}
> > 
> >  	pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CAP,
> > &cap);
> > @@ -260,17 +290,15 @@ static int dpc_probe(struct pcie_device *dev)
> >  		FLAG(cap, PCI_EXP_DPC_CAP_SW_TRIGGER), dpc-
> > >rp_log_size,
> >  		FLAG(cap, PCI_EXP_DPC_CAP_DL_ACTIVE));
> >  	return status;
> > +
> > +disable_dpc:
> > +	dpc_disable(pdev);
> > +	return status;
> >  }
> > 
> >  static void dpc_remove(struct pcie_device *dev)
> >  {
> > -	struct dpc_dev *dpc = get_service_data(dev);
> > -	struct pci_dev *pdev = dev->port;
> > -	u16 ctl;
> > -
> > -	pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL,
> > &ctl);
> > -	ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL |
> > PCI_EXP_DPC_CTL_INT_EN);
> > -	pci_write_config_word(pdev, dpc->cap_pos +
> > PCI_EXP_DPC_CTL, ctl);
> > +	dpc_disable(dev->port);
> >  }
> > 
> >  static struct pcie_port_service_driver dpcdriver = {
> > diff --git a/include/linux/pci.h b/include/linux/pci.h
> > index 5454e6b..559b792 100644
> > --- a/include/linux/pci.h
> > +++ b/include/linux/pci.h
> > @@ -1473,6 +1473,12 @@ static inline int pci_irqd_intx_xlate(struct
> > irq_domain *d,
> >  static inline bool pci_aer_available(void) { return false; }
> >  #endif
> > 
> > +#ifdef CONFIG_PCIE_DPC
> > +bool pci_dpc_available(void);
> > +#else
> > +static inline bool pci_dpc_available(void) { return false; }
> > +#endif
> > +
> >  #ifdef CONFIG_PCIE_ECRC
> >  void pcie_set_ecrc_checking(struct pci_dev *dev);
> >  void pcie_ecrc_get_policy(char *str);
Matthew Wilcox Aug. 16, 2018, 3:49 p.m. UTC | #4
On Wed, Aug 15, 2018 at 03:26:39PM -0600, Jon Derrick wrote:
> Some users may want to disable downstream port containment (DPC), so
> give them this option

Is it possible they might only want to disable DPC on a subset of the
hierarchy rather than globally?
Jon Derrick Aug. 16, 2018, 3:50 p.m. UTC | #5
On Thu, 2018-08-16 at 08:49 -0700, Matthew Wilcox wrote:
> On Wed, Aug 15, 2018 at 03:26:39PM -0600, Jon Derrick wrote:
> > Some users may want to disable downstream port containment (DPC),
> > so
> > give them this option
> 
> Is it possible they might only want to disable DPC on a subset of the
> hierarchy rather than globally?

Absolutely. I was hoping Logan's pci dev_str would land because I have
a few others I want to convert to that api for granular tuning
Bjorn Helgaas Aug. 16, 2018, 8:31 p.m. UTC | #6
On Thu, Aug 16, 2018 at 03:50:47PM +0000, Derrick, Jonathan wrote:
> On Thu, 2018-08-16 at 08:49 -0700, Matthew Wilcox wrote:
> > On Wed, Aug 15, 2018 at 03:26:39PM -0600, Jon Derrick wrote:
> > > Some users may want to disable downstream port containment (DPC),
> > > so
> > > give them this option
> > 
> > Is it possible they might only want to disable DPC on a subset of the
> > hierarchy rather than globally?
> 
> Absolutely. I was hoping Logan's pci dev_str would land because I have
> a few others I want to convert to that api for granular tuning

What's the use case here?  I acknowledge there are cases where we need
them, but I'm not a fan of kernel parameters in general because
they're a real hassle for users.

Is there something wrong with DPC?  Is there some way we can make it
smarter so it does the right thing automatically?

I'm more OK with a blanket "nodpc" switch intended for debugging.
If we add the complexity of subsets of the hierarchy it starts
sounding like an administrative thing that makes me more hesitant. 

Could this be done via a sysfs switch instead?  That potentially could
work for hot-added things where a kernel parameter doesn't work so
well.

Please squash the doc patch and the code change so it's easier to keep
them together.

Bjorn
Jon Derrick Aug. 16, 2018, 8:50 p.m. UTC | #7
Hi Bjorn,

On Thu, 2018-08-16 at 15:31 -0500, Bjorn Helgaas wrote:
> On Thu, Aug 16, 2018 at 03:50:47PM +0000, Derrick, Jonathan wrote:
> > On Thu, 2018-08-16 at 08:49 -0700, Matthew Wilcox wrote:
> > > On Wed, Aug 15, 2018 at 03:26:39PM -0600, Jon Derrick wrote:
> > > > Some users may want to disable downstream port containment
> > > > (DPC),
> > > > so
> > > > give them this option
> > > 
> > > Is it possible they might only want to disable DPC on a subset of
> > > the
> > > hierarchy rather than globally?
> > 
> > Absolutely. I was hoping Logan's pci dev_str would land because I
> > have
> > a few others I want to convert to that api for granular tuning
> 
> What's the use case here?  I acknowledge there are cases where we
> need
> them, but I'm not a fan of kernel parameters in general because
> they're a real hassle for users.
> 
> Is there something wrong with DPC?  Is there some way we can make it
> smarter so it does the right thing automatically?
I've encountered some BIOS or switches (not sure who) who've appeared
to have enabled DPC by default, prior to kernel setup. Some users may
just not want this, but it was primarily intended for debugging when I
conceived it.

There's also the ongoing thread in linux-pci about err handling in PPC
EEH, who may also desire to disable DPC until those issues are
resolved.


It can also be disabled with setpci, but is that any less of a hassle?
Genuine question to understand your point of view.


> 
> I'm more OK with a blanket "nodpc" switch intended for debugging.
> If we add the complexity of subsets of the hierarchy it starts
> sounding like an administrative thing that makes me more hesitant. 
> 
That's ok with me. I was having trouble cleanly keeping the string
around after early_param anyways.


To again understand your point of view, is there anything wrong with
administrative things in kernel boot parameters? There will be cases
where we may want to deviate from default or global pci=* parameters
for certain hierarchies and they can't necessarily be set after the
fact (ex, hpmemsize).


> Could this be done via a sysfs switch instead?  That potentially
> could
> work for hot-added things where a kernel parameter doesn't work so
> well.
> 
> Please squash the doc patch and the code change so it's easier to
> keep
> them together.
> 
> Bjorn
Keith Busch Aug. 16, 2018, 9:19 p.m. UTC | #8
On Thu, Aug 16, 2018 at 01:50:42PM -0700, Derrick, Jonathan wrote:
> It can also be disabled with setpci, but is that any less of a hassle?
> Genuine question to understand your point of view.

That is not a real solution, IMO. 'setpci' is good to inject things
for testing, but it changes config space without the kernel aware that
you've done that, so it is inherently racey with other kernel threads
touching pci config space. And the kernel or platform may end up undoing
what you had 'setpci' do anyway with no immediate way to be notified it
was changed.
Jon Derrick Aug. 16, 2018, 9:28 p.m. UTC | #9
On Thu, 2018-08-16 at 15:19 -0600, Keith Busch wrote:
> On Thu, Aug 16, 2018 at 01:50:42PM -0700, Derrick, Jonathan wrote:
> > It can also be disabled with setpci, but is that any less of a
> > hassle?
> > Genuine question to understand your point of view.
> 
> That is not a real solution, IMO. 'setpci' is good to inject things
> for testing, but it changes config space without the kernel aware
> that
> you've done that, so it is inherently racey with other kernel threads
> touching pci config space. And the kernel or platform may end up
> undoing
> what you had 'setpci' do anyway with no immediate way to be notified
> it
> was changed.

It sounds like a sysfs toggle could be useful regardless of the boot
parameter.
Bjorn Helgaas Aug. 17, 2018, 2:25 p.m. UTC | #10
On Thu, Aug 16, 2018 at 08:50:42PM +0000, Derrick, Jonathan wrote:
> On Thu, 2018-08-16 at 15:31 -0500, Bjorn Helgaas wrote:
> > On Thu, Aug 16, 2018 at 03:50:47PM +0000, Derrick, Jonathan wrote:
> > > On Thu, 2018-08-16 at 08:49 -0700, Matthew Wilcox wrote:
> > > > On Wed, Aug 15, 2018 at 03:26:39PM -0600, Jon Derrick wrote:
> > > > > Some users may want to disable downstream port containment
> > > > > (DPC),
> > > > > so
> > > > > give them this option
> > > > 
> > > > Is it possible they might only want to disable DPC on a subset of
> > > > the
> > > > hierarchy rather than globally?
> > > 
> > > Absolutely. I was hoping Logan's pci dev_str would land because I
> > > have
> > > a few others I want to convert to that api for granular tuning
> > 
> > What's the use case here?  I acknowledge there are cases where we
> > need
> > them, but I'm not a fan of kernel parameters in general because
> > they're a real hassle for users.
> > 
> > Is there something wrong with DPC?  Is there some way we can make it
> > smarter so it does the right thing automatically?
> I've encountered some BIOS or switches (not sure who) who've appeared
> to have enabled DPC by default, prior to kernel setup. Some users may
> just not want this, but it was primarily intended for debugging when I
> conceived it.
> 
> There's also the ongoing thread in linux-pci about err handling in PPC
> EEH, who may also desire to disable DPC until those issues are
> resolved.

I haven't caught up on that thread yet.  If DPC is incompatible with
PPC EEH, there's always the possibility of a switch in the code to
disable DPC automatically on PPC.

> It can also be disabled with setpci, but is that any less of a hassle?
> Genuine question to understand your point of view.

Keith already answered here; setpci is primarily for debugging and
shouldn't be recommended as normal practice.

> > I'm more OK with a blanket "nodpc" switch intended for debugging.
> > If we add the complexity of subsets of the hierarchy it starts
> > sounding like an administrative thing that makes me more hesitant. 
> ...
> To again understand your point of view, is there anything wrong with
> administrative things in kernel boot parameters? There will be cases
> where we may want to deviate from default or global pci=* parameters
> for certain hierarchies and they can't necessarily be set after the
> fact (ex, hpmemsize).

"There will be cases" sounds like we're doing something that *might*
be useful in the future.  It's better if we wait until we actually
discover a need for something.

There's also a tendency among users to trip over a problem, discover a
boot parameter like "pci=nomsi", and conclude that the problem is
"fixed".  In fact, we want the bug report so we can fix the kernel so
no boot parameter is needed.

I agree there are things that can't be set after boot.  Is this one of
them?  This seems like something that could be controlled at run-time.
But even there, I will ask what the requirement for it is, because we
don't want to clutter sysfs with things only needed for debugging.

Boot parameters are a hassle because it's hard to build a user
interface on top of them, and they require a reboot to take effect.
Jon Derrick Aug. 17, 2018, 2:45 p.m. UTC | #11
Hi Bjorn,

Thanks for your pov.

I'm going a lot of different ways on this one, but it seems like the
most reasonable option right now is to drop it until someone actually
presents a non-debugging need to disable DPC from sysfs or globally.

Best regards,
Jon

On Fri, 2018-08-17 at 09:25 -0500, Bjorn Helgaas wrote:
> On Thu, Aug 16, 2018 at 08:50:42PM +0000, Derrick, Jonathan wrote:
> > On Thu, 2018-08-16 at 15:31 -0500, Bjorn Helgaas wrote:
> > > On Thu, Aug 16, 2018 at 03:50:47PM +0000, Derrick, Jonathan
> > > wrote:
> > > > On Thu, 2018-08-16 at 08:49 -0700, Matthew Wilcox wrote:
> > > > > On Wed, Aug 15, 2018 at 03:26:39PM -0600, Jon Derrick wrote:
> > > > > > Some users may want to disable downstream port containment
> > > > > > (DPC),
> > > > > > so
> > > > > > give them this option
> > > > > 
> > > > > Is it possible they might only want to disable DPC on a
> > > > > subset of
> > > > > the
> > > > > hierarchy rather than globally?
> > > > 
> > > > Absolutely. I was hoping Logan's pci dev_str would land because
> > > > I
> > > > have
> > > > a few others I want to convert to that api for granular tuning
> > > 
> > > What's the use case here?  I acknowledge there are cases where we
> > > need
> > > them, but I'm not a fan of kernel parameters in general because
> > > they're a real hassle for users.
> > > 
> > > Is there something wrong with DPC?  Is there some way we can make
> > > it
> > > smarter so it does the right thing automatically?
> > 
> > I've encountered some BIOS or switches (not sure who) who've
> > appeared
> > to have enabled DPC by default, prior to kernel setup. Some users
> > may
> > just not want this, but it was primarily intended for debugging
> > when I
> > conceived it.
> > 
> > There's also the ongoing thread in linux-pci about err handling in
> > PPC
> > EEH, who may also desire to disable DPC until those issues are
> > resolved.
> 
> I haven't caught up on that thread yet.  If DPC is incompatible with
> PPC EEH, there's always the possibility of a switch in the code to
> disable DPC automatically on PPC.
> 
> > It can also be disabled with setpci, but is that any less of a
> > hassle?
> > Genuine question to understand your point of view.
> 
> Keith already answered here; setpci is primarily for debugging and
> shouldn't be recommended as normal practice.
> 
> > > I'm more OK with a blanket "nodpc" switch intended for debugging.
> > > If we add the complexity of subsets of the hierarchy it starts
> > > sounding like an administrative thing that makes me more
> > > hesitant. 
> > 
> > ...
> > To again understand your point of view, is there anything wrong
> > with
> > administrative things in kernel boot parameters? There will be
> > cases
> > where we may want to deviate from default or global pci=*
> > parameters
> > for certain hierarchies and they can't necessarily be set after the
> > fact (ex, hpmemsize).
> 
> "There will be cases" sounds like we're doing something that *might*
> be useful in the future.  It's better if we wait until we actually
> discover a need for something.
> 
> There's also a tendency among users to trip over a problem, discover
> a
> boot parameter like "pci=nomsi", and conclude that the problem is
> "fixed".  In fact, we want the bug report so we can fix the kernel so
> no boot parameter is needed.
> 
> I agree there are things that can't be set after boot.  Is this one
> of
> them?  This seems like something that could be controlled at run-
> time.
> But even there, I will ask what the requirement for it is, because we
> don't want to clutter sysfs with things only needed for debugging.
> 
> Boot parameters are a hassle because it's hard to build a user
> interface on top of them, and they require a reboot to take effect.
diff mbox series

Patch

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 80da484..4e629c2 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -6062,6 +6062,8 @@  static int __init pci_setup(char *str)
 				pcie_ats_disabled = true;
 			} else if (!strcmp(str, "noaer")) {
 				pci_no_aer();
+			} else if (!strcmp(str, "nodpc")) {
+				pci_no_dpc();
 			} else if (!strcmp(str, "earlydump")) {
 				pci_early_dump = true;
 			} else if (!strncmp(str, "realloc=", 8)) {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 6e0d152..f73f29e 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -536,4 +536,10 @@  static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
 static inline void pci_aer_clear_device_status(struct pci_dev *dev) { }
 #endif
 
+#ifdef CONFIG_PCIE_DPC
+void pci_no_dpc(void);
+#else
+static inline void pci_no_dpc(void) { }
+#endif
+
 #endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index f03279f..068fca0 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -44,6 +44,18 @@  struct dpc_dev {
 	"Memory Request Completion Timeout",		 /* Bit Position 18 */
 };
 
+static int pcie_dpc_disable;
+
+void pci_no_dpc(void)
+{
+	pcie_dpc_disable = 1;
+}
+
+bool pci_dpc_available(void)
+{
+	return !pcie_dpc_disable && pci_aer_available() && pci_msi_enabled();
+}
+
 static int dpc_wait_rp_inactive(struct dpc_dev *dpc)
 {
 	unsigned long timeout = jiffies + HZ;
@@ -209,6 +221,17 @@  static irqreturn_t dpc_irq(int irq, void *context)
 	return IRQ_HANDLED;
 }
 
+static void dpc_disable(struct pci_dev *pdev)
+{
+	u16 cap_pos, ctl;
+
+	cap_pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DPC);
+	pci_read_config_word(pdev, cap_pos + PCI_EXP_DPC_CTL, &ctl);
+	ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_EN_NONFATAL |
+		 PCI_EXP_DPC_CTL_INT_EN);
+	pci_write_config_word(pdev, cap_pos + PCI_EXP_DPC_CTL, ctl);
+}
+
 #define FLAG(x, y) (((x) & (y)) ? '+' : '-')
 static int dpc_probe(struct pcie_device *dev)
 {
@@ -221,9 +244,16 @@  static int dpc_probe(struct pcie_device *dev)
 	if (pcie_aer_get_firmware_first(pdev))
 		return -ENOTSUPP;
 
+	if (!pci_dpc_available()) {
+		status = -ENOTSUPP;
+		goto disable_dpc;
+	}
+
 	dpc = devm_kzalloc(device, sizeof(*dpc), GFP_KERNEL);
-	if (!dpc)
-		return -ENOMEM;
+	if (!dpc) {
+		status = -ENOMEM;
+		goto disable_dpc;
+	}
 
 	dpc->cap_pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DPC);
 	dpc->dev = dev;
@@ -235,7 +265,7 @@  static int dpc_probe(struct pcie_device *dev)
 	if (status) {
 		dev_warn(device, "request IRQ%d failed: %d\n", dev->irq,
 			 status);
-		return status;
+		goto disable_dpc;
 	}
 
 	pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CAP, &cap);
@@ -260,17 +290,15 @@  static int dpc_probe(struct pcie_device *dev)
 		FLAG(cap, PCI_EXP_DPC_CAP_SW_TRIGGER), dpc->rp_log_size,
 		FLAG(cap, PCI_EXP_DPC_CAP_DL_ACTIVE));
 	return status;
+
+disable_dpc:
+	dpc_disable(pdev);
+	return status;
 }
 
 static void dpc_remove(struct pcie_device *dev)
 {
-	struct dpc_dev *dpc = get_service_data(dev);
-	struct pci_dev *pdev = dev->port;
-	u16 ctl;
-
-	pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl);
-	ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
-	pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
+	dpc_disable(dev->port);
 }
 
 static struct pcie_port_service_driver dpcdriver = {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 5454e6b..559b792 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1473,6 +1473,12 @@  static inline int pci_irqd_intx_xlate(struct irq_domain *d,
 static inline bool pci_aer_available(void) { return false; }
 #endif
 
+#ifdef CONFIG_PCIE_DPC
+bool pci_dpc_available(void);
+#else
+static inline bool pci_dpc_available(void) { return false; }
+#endif
+
 #ifdef CONFIG_PCIE_ECRC
 void pcie_set_ecrc_checking(struct pci_dev *dev);
 void pcie_ecrc_get_policy(char *str);