diff mbox series

[v2] PCI: Introduce flag for detached virtual functions

Message ID 1597260071-2219-2-git-send-email-mjrosato@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series [v2] PCI: Introduce flag for detached virtual functions | expand

Commit Message

Matthew Rosato Aug. 12, 2020, 7:21 p.m. UTC
s390x has the notion of providing VFs to the kernel in a manner
where the associated PF is inaccessible other than via firmware.
These are not treated as typical VFs and access to them is emulated
by underlying firmware which can still access the PF.  After
abafbc55 however these detached VFs were no longer able to work
with vfio-pci as the firmware does not provide emulation of the
PCI_COMMAND_MEMORY bit.  In this case, let's explicitly recognize
these detached VFs so that vfio-pci can allow memory access to
them again.

Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
---
 arch/s390/pci/pci.c                |  8 ++++++++
 drivers/vfio/pci/vfio_pci_config.c | 11 +++++++----
 include/linux/pci.h                |  1 +
 3 files changed, 16 insertions(+), 4 deletions(-)

Comments

Alex Williamson Aug. 12, 2020, 8:32 p.m. UTC | #1
On Wed, 12 Aug 2020 15:21:11 -0400
Matthew Rosato <mjrosato@linux.ibm.com> wrote:

> s390x has the notion of providing VFs to the kernel in a manner
> where the associated PF is inaccessible other than via firmware.
> These are not treated as typical VFs and access to them is emulated
> by underlying firmware which can still access the PF.  After
> abafbc55 however these detached VFs were no longer able to work
> with vfio-pci as the firmware does not provide emulation of the
> PCI_COMMAND_MEMORY bit.  In this case, let's explicitly recognize
> these detached VFs so that vfio-pci can allow memory access to
> them again.
> 

Might as well include a fixes tag too.

Fixes: abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access on disabled memory")

You might also extend the sha1 in the log to 12 chars as well, or
replace it with a reference to the fixes tag.

> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
> ---
>  arch/s390/pci/pci.c                |  8 ++++++++
>  drivers/vfio/pci/vfio_pci_config.c | 11 +++++++----
>  include/linux/pci.h                |  1 +
>  3 files changed, 16 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
> index 3902c9f..04ac76d 100644
> --- a/arch/s390/pci/pci.c
> +++ b/arch/s390/pci/pci.c
> @@ -581,6 +581,14 @@ int pcibios_enable_device(struct pci_dev *pdev, int mask)
>  {
>  	struct zpci_dev *zdev = to_zpci(pdev);
>  
> +	/*
> +	 * If we have a VF on a non-multifunction bus, it must be a VF that is
> +	 * detached from its parent PF.  We rely on firmware emulation to
> +	 * provide underlying PF details.
> +	 */
> +	if (zdev->vfn && !zdev->zbus->multifunction)
> +		pdev->detached_vf = 1;
> +
>  	zpci_debug_init_device(zdev, dev_name(&pdev->dev));
>  	zpci_fmb_enable_device(zdev);
>  
> diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
> index d98843f..ee45216 100644
> --- a/drivers/vfio/pci/vfio_pci_config.c
> +++ b/drivers/vfio/pci/vfio_pci_config.c
> @@ -406,7 +406,8 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev)
>  	 * PF SR-IOV capability, there's therefore no need to trigger
>  	 * faults based on the virtual value.
>  	 */
> -	return pdev->is_virtfn || (cmd & PCI_COMMAND_MEMORY);
> +	return pdev->is_virtfn || pdev->detached_vf ||
> +	       (cmd & PCI_COMMAND_MEMORY);
>  }
>  
>  /*
> @@ -420,7 +421,7 @@ static void vfio_bar_restore(struct vfio_pci_device *vdev)
>  	u16 cmd;
>  	int i;
>  
> -	if (pdev->is_virtfn)
> +	if (pdev->is_virtfn || pdev->detached_vf)
>  		return;
>  
>  	pci_info(pdev, "%s: reset recovery - restoring BARs\n", __func__);
> @@ -521,7 +522,8 @@ static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos,
>  	count = vfio_default_config_read(vdev, pos, count, perm, offset, val);
>  
>  	/* Mask in virtual memory enable for SR-IOV devices */
> -	if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) {
> +	if ((offset == PCI_COMMAND) &&
> +	    (vdev->pdev->is_virtfn || vdev->pdev->detached_vf)) {
>  		u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]);
>  		u32 tmp_val = le32_to_cpu(*val);
>  
> @@ -1734,7 +1736,8 @@ int vfio_config_init(struct vfio_pci_device *vdev)
>  				 vconfig[PCI_INTERRUPT_PIN]);
>  
>  		vconfig[PCI_INTERRUPT_PIN] = 0; /* Gratuitous for good VFs */
> -
> +	}
> +	if (pdev->is_virtfn || pdev->detached_vf) {
>  		/*
>  		 * VFs do no implement the memory enable bit of the COMMAND
>  		 * register therefore we'll not have it set in our initial
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 8355306..23a6972 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -445,6 +445,7 @@ struct pci_dev {
>  	unsigned int	is_probed:1;		/* Device probing in progress */
>  	unsigned int	link_active_reporting:1;/* Device capable of reporting link active */
>  	unsigned int	no_vf_scan:1;		/* Don't scan for VFs after IOV enablement */
> +	unsigned int	detached_vf:1;		/* VF without local PF access */

Is there too much implicit knowledge in defining a "detached VF"?  For
example, why do we know that we can skip the portion of
vfio_config_init() that copies the vendor and device IDs from the
struct pci_dev into the virtual config space?  It's true on s390x, but
I think that's because we know that firmware emulates those registers
for us.  We also skip the INTx pin register sanity checking.  Do we do
that because we haven't installed the broken device into an s390x
system?  Because we know firmware manages that for us too?  Or simply
because s390x doesn't support INTx anyway, and therefore it's another
architecture implicit decision?

If detached_vf is really equivalent to is_virtfn for all cases that
don't care about referencing physfn on the pci_dev, then we should
probably have a macro to that effect.  Otherwise, if we're just trying
to describe that the memory bit of the command register is
unimplemented but always enabled, like a VF, should we specifically
describe that attribute instead?  If so, should we instead do that with
pci_dev_flags_t?  Thanks,

Alex

>  	pci_dev_flags_t dev_flags;
>  	atomic_t	enable_cnt;	/* pci_enable_device has been called */
>
Oliver O'Halloran Aug. 13, 2020, 1:55 a.m. UTC | #2
On Thu, Aug 13, 2020 at 5:21 AM Matthew Rosato <mjrosato@linux.ibm.com> wrote:
>
> s390x has the notion of providing VFs to the kernel in a manner
> where the associated PF is inaccessible other than via firmware.
> These are not treated as typical VFs and access to them is emulated
> by underlying firmware which can still access the PF.  After
> abafbc55 however these detached VFs were no longer able to work
> with vfio-pci as the firmware does not provide emulation of the
> PCI_COMMAND_MEMORY bit.  In this case, let's explicitly recognize
> these detached VFs so that vfio-pci can allow memory access to
> them again.

Hmm, cool. I think we have a similar feature on pseries so that's
probably broken too.

> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
> ---
>  arch/s390/pci/pci.c                |  8 ++++++++
>  drivers/vfio/pci/vfio_pci_config.c | 11 +++++++----
>  include/linux/pci.h                |  1 +
>  3 files changed, 16 insertions(+), 4 deletions(-)
>
> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
> index 3902c9f..04ac76d 100644
> --- a/arch/s390/pci/pci.c
> +++ b/arch/s390/pci/pci.c
> @@ -581,6 +581,14 @@ int pcibios_enable_device(struct pci_dev *pdev, int mask)
>  {
>         struct zpci_dev *zdev = to_zpci(pdev);
>
> +       /*
> +        * If we have a VF on a non-multifunction bus, it must be a VF that is
> +        * detached from its parent PF.  We rely on firmware emulation to
> +        * provide underlying PF details.
> +        */
> +       if (zdev->vfn && !zdev->zbus->multifunction)
> +               pdev->detached_vf = 1;

The enable hook seems like it's a bit too late for this sort of
screwing around with the pci_dev. Anything in the setup path that
looks at ->detached_vf would see it cleared while anything that looks
after the device is enabled will see it set. Can this go into
pcibios_add_device() or a fixup instead?
Oliver O'Halloran Aug. 13, 2020, 1:59 a.m. UTC | #3
On Thu, Aug 13, 2020 at 6:33 AM Alex Williamson
<alex.williamson@redhat.com> wrote:
>
> On Wed, 12 Aug 2020 15:21:11 -0400
> Matthew Rosato <mjrosato@linux.ibm.com> wrote:
>
> > @@ -521,7 +522,8 @@ static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos,
> >       count = vfio_default_config_read(vdev, pos, count, perm, offset, val);
> >
> >       /* Mask in virtual memory enable for SR-IOV devices */
> > -     if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) {
> > +     if ((offset == PCI_COMMAND) &&
> > +         (vdev->pdev->is_virtfn || vdev->pdev->detached_vf)) {
> >               u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]);
> >               u32 tmp_val = le32_to_cpu(*val);
> >
> > @@ -1734,7 +1736,8 @@ int vfio_config_init(struct vfio_pci_device *vdev)
> >                                vconfig[PCI_INTERRUPT_PIN]);
> >
> >               vconfig[PCI_INTERRUPT_PIN] = 0; /* Gratuitous for good VFs */
> > -
> > +     }
> > +     if (pdev->is_virtfn || pdev->detached_vf) {
> >               /*
> >                * VFs do no implement the memory enable bit of the COMMAND
> >                * register therefore we'll not have it set in our initial
> > diff --git a/include/linux/pci.h b/include/linux/pci.h
> > index 8355306..23a6972 100644
> > --- a/include/linux/pci.h
> > +++ b/include/linux/pci.h
> > @@ -445,6 +445,7 @@ struct pci_dev {
> >       unsigned int    is_probed:1;            /* Device probing in progress */
> >       unsigned int    link_active_reporting:1;/* Device capable of reporting link active */
> >       unsigned int    no_vf_scan:1;           /* Don't scan for VFs after IOV enablement */
> > +     unsigned int    detached_vf:1;          /* VF without local PF access */
>
> Is there too much implicit knowledge in defining a "detached VF"?  For
> example, why do we know that we can skip the portion of
> vfio_config_init() that copies the vendor and device IDs from the
> struct pci_dev into the virtual config space?  It's true on s390x, but
> I think that's because we know that firmware emulates those registers
> for us.
>
> We also skip the INTx pin register sanity checking.  Do we do
> that because we haven't installed the broken device into an s390x
> system?  Because we know firmware manages that for us too?  Or simply
> because s390x doesn't support INTx anyway, and therefore it's another
> architecture implicit decision?

Agreed. Any hacks we put in for normal VFs are going to be needed for
the passed-though VF case. Only applying the memory space enable
workaround doesn't make sense to me either.

> If detached_vf is really equivalent to is_virtfn for all cases that
> don't care about referencing physfn on the pci_dev, then we should
> probably have a macro to that effect.

A pci_is_virtfn() helper would be better than open coding both checks
everywhere. That said, it might be solving the wrong problem. The
union between ->physfn and ->sriov has always seemed like a footgun to
me so we might be better off switching the users who want a physfn to
a helper instead. i.e.

struct pci_dev *pci_get_vf_physfn(struct pci_dev *vf)
{
        if (!vf->is_virtfn)
                return NULL;

        return vf->physfn;
}

...

pf = pci_get_vf_physfn(vf)
if (pf)
    /* do pf things */

Then we can just use ->is_virtfn for the normal and detached cases.

Oliver
Niklas Schnelle Aug. 13, 2020, 7:54 a.m. UTC | #4
On 8/13/20 3:59 AM, Oliver O'Halloran wrote:
> On Thu, Aug 13, 2020 at 6:33 AM Alex Williamson
> <alex.williamson@redhat.com> wrote:
>>
>> On Wed, 12 Aug 2020 15:21:11 -0400
>> Matthew Rosato <mjrosato@linux.ibm.com> wrote:
>>
... snip ...
>>
>> Is there too much implicit knowledge in defining a "detached VF"?  For
>> example, why do we know that we can skip the portion of
>> vfio_config_init() that copies the vendor and device IDs from the
>> struct pci_dev into the virtual config space?  It's true on s390x, but
>> I think that's because we know that firmware emulates those registers
>> for us.
>>
>> We also skip the INTx pin register sanity checking.  Do we do
>> that because we haven't installed the broken device into an s390x
>> system?  Because we know firmware manages that for us too?  Or simply
>> because s390x doesn't support INTx anyway, and therefore it's another
>> architecture implicit decision?
> 
> Agreed. Any hacks we put in for normal VFs are going to be needed for
> the passed-though VF case. Only applying the memory space enable
> workaround doesn't make sense to me either.

We did actually have the detached_vf check in that if in
a previous patch version, turning on the INTx and quirk checks.
We decided to send a minimal version for the discussion.
That said I agree that this is currently too specific to our
case.

> 
>> If detached_vf is really equivalent to is_virtfn for all cases that
>> don't care about referencing physfn on the pci_dev, then we should
>> probably have a macro to that effect.

In my opinion it really is, that's why we initially tried to just
set pdev->is_virtfn leaving the physfn pointer NULL for these
detached VFs. 
But as you said that gets uncomfortable because of the union and existing code
assuming that pdev->is_virtfn always means physfn is set.

I think the underlying problem here is, that the current use
of pdev->is_virtfn conflates the two reasons we need to know whether
something is a VF:

1. For dealing with the differences in how a VF presents itself vs a PF
2. For knowing whether the physfn/sriov union is a pointer to the parent PF

If we could untangle this in a sane way I think that would
be the best long term solution.

> 
> A pci_is_virtfn() helper would be better than open coding both checks
> everywhere. That said, it might be solving the wrong problem. The
> union between ->physfn and ->sriov has always seemed like a footgun to
> me so we might be better off switching the users who want a physfn to
> a helper instead. i.e.
> 
> struct pci_dev *pci_get_vf_physfn(struct pci_dev *vf)
> {
>         if (!vf->is_virtfn)
>                 return NULL;
> 
>         return vf->physfn;
> }

Hmm, this is almost exactly  include/linux/pci.h:pci_physfn()
except that returns the argument pdev itself when is_virtfn
is not set.

> 
> ...
> 
> pf = pci_get_vf_physfn(vf)
> if (pf)
>     /* do pf things */
> 
> Then we can just use ->is_virtfn for the normal and detached cases.

I'm asssuming you mean by setting vf->is_virtfn = 1; vf->physfn = NULL
for the detached case? I think that actually also works with the existing
pci_physfn() helper but it requires handling a returned NULL at
all callsites.

> 
> Oliver
>
Niklas Schnelle Aug. 13, 2020, 9 a.m. UTC | #5
On 8/13/20 3:55 AM, Oliver O'Halloran wrote:
> On Thu, Aug 13, 2020 at 5:21 AM Matthew Rosato <mjrosato@linux.ibm.com> wrote:
>>
>> s390x has the notion of providing VFs to the kernel in a manner
>> where the associated PF is inaccessible other than via firmware.
>> These are not treated as typical VFs and access to them is emulated
>> by underlying firmware which can still access the PF.  After
>> abafbc55 however these detached VFs were no longer able to work
>> with vfio-pci as the firmware does not provide emulation of the
>> PCI_COMMAND_MEMORY bit.  In this case, let's explicitly recognize
>> these detached VFs so that vfio-pci can allow memory access to
>> them again.
> 
> Hmm, cool. I think we have a similar feature on pseries so that's
> probably broken too.
> 
>> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
>> ---
>>  arch/s390/pci/pci.c                |  8 ++++++++
>>  drivers/vfio/pci/vfio_pci_config.c | 11 +++++++----
>>  include/linux/pci.h                |  1 +
>>  3 files changed, 16 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
>> index 3902c9f..04ac76d 100644
>> --- a/arch/s390/pci/pci.c
>> +++ b/arch/s390/pci/pci.c
>> @@ -581,6 +581,14 @@ int pcibios_enable_device(struct pci_dev *pdev, int mask)
>>  {
>>         struct zpci_dev *zdev = to_zpci(pdev);
>>
>> +       /*
>> +        * If we have a VF on a non-multifunction bus, it must be a VF that is
>> +        * detached from its parent PF.  We rely on firmware emulation to
>> +        * provide underlying PF details.
>> +        */
>> +       if (zdev->vfn && !zdev->zbus->multifunction)
>> +               pdev->detached_vf = 1;
> 
> The enable hook seems like it's a bit too late for this sort of
> screwing around with the pci_dev. Anything in the setup path that
> looks at ->detached_vf would see it cleared while anything that looks
> after the device is enabled will see it set. Can this go into
> pcibios_add_device() or a fixup instead?
> 

This particular check could go into pcibios_add_device() yes.
We're also currently working on a slight rework of how
we establish the VF to parent PF linking including the sysfs
part of that. The latter sadly can only go after the sysfs
for the virtfn has been created and that only happens
after all fixups. We would like to do both together because
the latter sets pdev->is_virtfn which I think is closely related.

I was thinking of starting another discussion
about adding a hook that is executed just after the sysfs entries
for the PCI device are created but haven't yet.
That said pcibios_enable_device() is called before drivers
like vfio-pci are enabled and so as long as all uses of pdev->detached_vf
are in drivers it should be early enough. AFAIK almost everything
dealing with VFs before that is already skipped with pdev->no_vf_scan
though.
Oliver O'Halloran Aug. 13, 2020, 9:59 a.m. UTC | #6
On Thu, Aug 13, 2020 at 7:00 PM Niklas Schnelle <schnelle@linux.ibm.com> wrote:
>
>
> On 8/13/20 3:55 AM, Oliver O'Halloran wrote:
> > On Thu, Aug 13, 2020 at 5:21 AM Matthew Rosato <mjrosato@linux.ibm.com> wrote:
> >> *snip*
> >> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
> >> index 3902c9f..04ac76d 100644
> >> --- a/arch/s390/pci/pci.c
> >> +++ b/arch/s390/pci/pci.c
> >> @@ -581,6 +581,14 @@ int pcibios_enable_device(struct pci_dev *pdev, int mask)
> >>  {
> >>         struct zpci_dev *zdev = to_zpci(pdev);
> >>
> >> +       /*
> >> +        * If we have a VF on a non-multifunction bus, it must be a VF that is
> >> +        * detached from its parent PF.  We rely on firmware emulation to
> >> +        * provide underlying PF details.
> >> +        */
> >> +       if (zdev->vfn && !zdev->zbus->multifunction)
> >> +               pdev->detached_vf = 1;
> >
> > The enable hook seems like it's a bit too late for this sort of
> > screwing around with the pci_dev. Anything in the setup path that
> > looks at ->detached_vf would see it cleared while anything that looks
> > after the device is enabled will see it set. Can this go into
> > pcibios_add_device() or a fixup instead?
> >
>
> This particular check could go into pcibios_add_device() yes.
> We're also currently working on a slight rework of how
> we establish the VF to parent PF linking including the sysfs
> part of that. The latter sadly can only go after the sysfs
> for the virtfn has been created and that only happens
> after all fixups. We would like to do both together because
> the latter sets pdev->is_virtfn which I think is closely related.
>
> I was thinking of starting another discussion
> about adding a hook that is executed just after the sysfs entries
> for the PCI device are created but haven't yet.

if all you need is sysfs then pcibios_bus_add_device() or a bus
notifier should work

> That said pcibios_enable_device() is called before drivers
> like vfio-pci are enabled

Hmm, is that an s390 thing? I was under the impression that drivers
handled enabling the device rather than assuming the platform did it
for them. Granted it's usually one of the first things a driver does,
but there's still scope for surprising behaviour.

> and so as long as all uses of pdev->detached_vf
> are in drivers it should be early enough. AFAIK almost everything
> dealing with VFs before that is already skipped with pdev->no_vf_scan
> though.

I'm sure it works fine in your particular case. My main gripe is that
you're adding a flag in a generic structure so people reading the code
without that context may make assumptions about when it's valid to
use. The number of pcibios_* hooks we have means that working out when
and where something happens in the pci setup path usually involves
going on a ~magical journey~ through generic and arch specific code.
It's not *that* bad once you've worked out how it all fits together,
but it's still a pain. If we can initialise stuff before the pci_dev
is added to the bus it's usually for the better.

Oliver
Niklas Schnelle Aug. 13, 2020, 10:40 a.m. UTC | #7
On 8/13/20 11:59 AM, Oliver O'Halloran wrote:
> On Thu, Aug 13, 2020 at 7:00 PM Niklas Schnelle <schnelle@linux.ibm.com> wrote:
>>
>>
>> On 8/13/20 3:55 AM, Oliver O'Halloran wrote:
>>> On Thu, Aug 13, 2020 at 5:21 AM Matthew Rosato <mjrosato@linux.ibm.com> wrote:
>>>> *snip*
>>>> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
>>>> index 3902c9f..04ac76d 100644
>>>> --- a/arch/s390/pci/pci.c
>>>> +++ b/arch/s390/pci/pci.c
>>>> @@ -581,6 +581,14 @@ int pcibios_enable_device(struct pci_dev *pdev, int mask)
>>>>  {
>>>>         struct zpci_dev *zdev = to_zpci(pdev);
>>>>
>>>> +       /*
>>>> +        * If we have a VF on a non-multifunction bus, it must be a VF that is
>>>> +        * detached from its parent PF.  We rely on firmware emulation to
>>>> +        * provide underlying PF details.
>>>> +        */
>>>> +       if (zdev->vfn && !zdev->zbus->multifunction)
>>>> +               pdev->detached_vf = 1;
>>>
>>> The enable hook seems like it's a bit too late for this sort of
>>> screwing around with the pci_dev. Anything in the setup path that
>>> looks at ->detached_vf would see it cleared while anything that looks
>>> after the device is enabled will see it set. Can this go into
>>> pcibios_add_device() or a fixup instead?
>>>
>>
>> This particular check could go into pcibios_add_device() yes.
>> We're also currently working on a slight rework of how
>> we establish the VF to parent PF linking including the sysfs
>> part of that. The latter sadly can only go after the sysfs
>> for the virtfn has been created and that only happens
>> after all fixups. We would like to do both together because
>> the latter sets pdev->is_virtfn which I think is closely related.
>>
>> I was thinking of starting another discussion
>> about adding a hook that is executed just after the sysfs entries
>> for the PCI device are created but haven't yet.
> 
> if all you need is sysfs then pcibios_bus_add_device() or a bus
> notifier should work

So this might go a bit off track but the problem is that
on s390 a VF can be disabled and reenabled with disable_slot()/enable_slot().
In this case pcibios_bus_add_device() is not called again but
the PF/VF link needs to be reestablished.

> 
>> That said pcibios_enable_device() is called before drivers
>> like vfio-pci are enabled
> 
> Hmm, is that an s390 thing? I was under the impression that drivers
> handled enabling the device rather than assuming the platform did it
> for them. Granted it's usually one of the first things a driver does,
> but there's still scope for surprising behaviour.

No you're absolutely right I formulated this wrong, pcibios_enable_device()
is called by the drivers but before they can really use the device.

But yes I'm not super happy with this either and I
agree for this patch series we should move the check to pcibios_add_device()
and thinking about it more I think I'll really have to find a better
place for our linking as well, pcibios_enable_device() does work
nicely in practice buy it indeed poses room for surprising behavior.

> 
... snip ...
Niklas Schnelle Aug. 13, 2020, 12:34 p.m. UTC | #8
On 8/13/20 12:40 PM, Niklas Schnelle wrote:
> 
> 
> On 8/13/20 11:59 AM, Oliver O'Halloran wrote:
>> On Thu, Aug 13, 2020 at 7:00 PM Niklas Schnelle <schnelle@linux.ibm.com> wrote:
>>>
>>>
>>> On 8/13/20 3:55 AM, Oliver O'Halloran wrote:
>>>> On Thu, Aug 13, 2020 at 5:21 AM Matthew Rosato <mjrosato@linux.ibm.com> wrote:
>>>>> *snip*
>>>>> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
>>>>> index 3902c9f..04ac76d 100644
>>>>> --- a/arch/s390/pci/pci.c
>>>>> +++ b/arch/s390/pci/pci.c
>>>>> @@ -581,6 +581,14 @@ int pcibios_enable_device(struct pci_dev *pdev, int mask)
>>>>>  {
>>>>>         struct zpci_dev *zdev = to_zpci(pdev);
>>>>>
>>>>> +       /*
>>>>> +        * If we have a VF on a non-multifunction bus, it must be a VF that is
>>>>> +        * detached from its parent PF.  We rely on firmware emulation to
>>>>> +        * provide underlying PF details.
>>>>> +        */
>>>>> +       if (zdev->vfn && !zdev->zbus->multifunction)
>>>>> +               pdev->detached_vf = 1;
>>>>
>>>> The enable hook seems like it's a bit too late for this sort of
>>>> screwing around with the pci_dev. Anything in the setup path that
>>>> looks at ->detached_vf would see it cleared while anything that looks
>>>> after the device is enabled will see it set. Can this go into
>>>> pcibios_add_device() or a fixup instead?
>>>>
>>>
>>> This particular check could go into pcibios_add_device() yes.
>>> We're also currently working on a slight rework of how
>>> we establish the VF to parent PF linking including the sysfs
>>> part of that. The latter sadly can only go after the sysfs
>>> for the virtfn has been created and that only happens
>>> after all fixups. We would like to do both together because
>>> the latter sets pdev->is_virtfn which I think is closely related.
>>>
>>> I was thinking of starting another discussion
>>> about adding a hook that is executed just after the sysfs entries
>>> for the PCI device are created but haven't yet.
>>
>> if all you need is sysfs then pcibios_bus_add_device() or a bus
>> notifier should work
> 
> So this might go a bit off track but the problem is that
> on s390 a VF can be disabled and reenabled with disable_slot()/enable_slot().
> In this case pcibios_bus_add_device() is not called again but
> the PF/VF link needs to be reestablished.

Scratch that I must have made some stupid mistake last time I tried
this, with your suggestion I tried again and it works perfectly
moving the setup into pcibios_bus_add_device().
Thank you, this is actually much nicer!

> 
> ... snip ...
>
Matthew Rosato Aug. 13, 2020, 1:09 p.m. UTC | #9
On 8/12/20 4:32 PM, Alex Williamson wrote:
> On Wed, 12 Aug 2020 15:21:11 -0400
> Matthew Rosato <mjrosato@linux.ibm.com> wrote:
> 
>> s390x has the notion of providing VFs to the kernel in a manner
>> where the associated PF is inaccessible other than via firmware.
>> These are not treated as typical VFs and access to them is emulated
>> by underlying firmware which can still access the PF.  After
>> abafbc55 however these detached VFs were no longer able to work
>> with vfio-pci as the firmware does not provide emulation of the
>> PCI_COMMAND_MEMORY bit.  In this case, let's explicitly recognize
>> these detached VFs so that vfio-pci can allow memory access to
>> them again.
>>
> 
> Might as well include a fixes tag too.
> 
> Fixes: abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access on disabled memory")
> 
> You might also extend the sha1 in the log to 12 chars as well, or
> replace it with a reference to the fixes tag.
> 
Sure.

>> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
>> ---
..snip..
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index 8355306..23a6972 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -445,6 +445,7 @@ struct pci_dev {
>>   	unsigned int	is_probed:1;		/* Device probing in progress */
>>   	unsigned int	link_active_reporting:1;/* Device capable of reporting link active */
>>   	unsigned int	no_vf_scan:1;		/* Don't scan for VFs after IOV enablement */
>> +	unsigned int	detached_vf:1;		/* VF without local PF access */
> 
> Is there too much implicit knowledge in defining a "detached VF"?  For
> example, why do we know that we can skip the portion of
> vfio_config_init() that copies the vendor and device IDs from the
> struct pci_dev into the virtual config space?  It's true on s390x, but
> I think that's because we know that firmware emulates those registers
> for us.  We also skip the INTx pin register sanity checking.  Do we do
> that because we haven't installed the broken device into an s390x
> system?  Because we know firmware manages that for us too?  Or simply
> because s390x doesn't support INTx anyway, and therefore it's another
> architecture implicit decision?

That's a fair point.  This was also discussed (overnight for me) in 
another thread that this patch is very s390-specific.  It doesn't have 
to be, we could also emulate these additional pieces to make things more 
general-purpose here.

> 
> If detached_vf is really equivalent to is_virtfn for all cases that
> don't care about referencing physfn on the pci_dev, then we should
> probably have a macro to that effect.  Otherwise, if we're just trying
> to describe that the memory bit of the command register is
> unimplemented but always enabled, like a VF, should we specifically
> describe that attribute instead?  If so, should we instead do that with
> pci_dev_flags_t?  Thanks,

Well, that's the particular issue that got us looking at this but I'm 
not so sure we wouldn't find further oddities later, hence the desire 
for a more general-purpose bit.

> 
> Alex
> 
>>   	pci_dev_flags_t dev_flags;
>>   	atomic_t	enable_cnt;	/* pci_enable_device has been called */
>>   
>
Matthew Rosato Aug. 13, 2020, 1:11 p.m. UTC | #10
On 8/13/20 8:34 AM, Niklas Schnelle wrote:
> 
> 
> On 8/13/20 12:40 PM, Niklas Schnelle wrote:
>>
>>
>> On 8/13/20 11:59 AM, Oliver O'Halloran wrote:
>>> On Thu, Aug 13, 2020 at 7:00 PM Niklas Schnelle <schnelle@linux.ibm.com> wrote:
>>>>
>>>>
>>>> On 8/13/20 3:55 AM, Oliver O'Halloran wrote:
>>>>> On Thu, Aug 13, 2020 at 5:21 AM Matthew Rosato <mjrosato@linux.ibm.com> wrote:
>>>>>> *snip*
>>>>>> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
>>>>>> index 3902c9f..04ac76d 100644
>>>>>> --- a/arch/s390/pci/pci.c
>>>>>> +++ b/arch/s390/pci/pci.c
>>>>>> @@ -581,6 +581,14 @@ int pcibios_enable_device(struct pci_dev *pdev, int mask)
>>>>>>   {
>>>>>>          struct zpci_dev *zdev = to_zpci(pdev);
>>>>>>
>>>>>> +       /*
>>>>>> +        * If we have a VF on a non-multifunction bus, it must be a VF that is
>>>>>> +        * detached from its parent PF.  We rely on firmware emulation to
>>>>>> +        * provide underlying PF details.
>>>>>> +        */
>>>>>> +       if (zdev->vfn && !zdev->zbus->multifunction)
>>>>>> +               pdev->detached_vf = 1;
>>>>>
>>>>> The enable hook seems like it's a bit too late for this sort of
>>>>> screwing around with the pci_dev. Anything in the setup path that
>>>>> looks at ->detached_vf would see it cleared while anything that looks
>>>>> after the device is enabled will see it set. Can this go into
>>>>> pcibios_add_device() or a fixup instead?
>>>>>
>>>>
>>>> This particular check could go into pcibios_add_device() yes.
>>>> We're also currently working on a slight rework of how
>>>> we establish the VF to parent PF linking including the sysfs
>>>> part of that. The latter sadly can only go after the sysfs
>>>> for the virtfn has been created and that only happens
>>>> after all fixups. We would like to do both together because
>>>> the latter sets pdev->is_virtfn which I think is closely related.
>>>>
>>>> I was thinking of starting another discussion
>>>> about adding a hook that is executed just after the sysfs entries
>>>> for the PCI device are created but haven't yet.
>>>
>>> if all you need is sysfs then pcibios_bus_add_device() or a bus
>>> notifier should work
>>
>> So this might go a bit off track but the problem is that
>> on s390 a VF can be disabled and reenabled with disable_slot()/enable_slot().
>> In this case pcibios_bus_add_device() is not called again but
>> the PF/VF link needs to be reestablished.
> 
> Scratch that I must have made some stupid mistake last time I tried
> this, with your suggestion I tried again and it works perfectly
> moving the setup into pcibios_bus_add_device().
> Thank you, this is actually much nicer!
> 

OK, and I can likewise relocate the setting of detached_vf to 
pcibios_bus_add_device().
Niklas Schnelle Aug. 13, 2020, 1:28 p.m. UTC | #11
On 8/13/20 3:11 PM, Matthew Rosato wrote:
> On 8/13/20 8:34 AM, Niklas Schnelle wrote:
>>
>>
>> On 8/13/20 12:40 PM, Niklas Schnelle wrote:
>>>
>>>
>>> On 8/13/20 11:59 AM, Oliver O'Halloran wrote:
>>>> On Thu, Aug 13, 2020 at 7:00 PM Niklas Schnelle <schnelle@linux.ibm.com> wrote:
>>>>>
>>>>>
>>>>> On 8/13/20 3:55 AM, Oliver O'Halloran wrote:
>>>>>> On Thu, Aug 13, 2020 at 5:21 AM Matthew Rosato <mjrosato@linux.ibm.com> wrote:
>>>>>>> *snip*
>>>>>>> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
>>>>>>> index 3902c9f..04ac76d 100644
>>>>>>> --- a/arch/s390/pci/pci.c
>>>>>>> +++ b/arch/s390/pci/pci.c
>>>>>>> @@ -581,6 +581,14 @@ int pcibios_enable_device(struct pci_dev *pdev, int mask)
>>>>>>>   {
>>>>>>>          struct zpci_dev *zdev = to_zpci(pdev);
>>>>>>>
>>>>>>> +       /*
>>>>>>> +        * If we have a VF on a non-multifunction bus, it must be a VF that is
>>>>>>> +        * detached from its parent PF.  We rely on firmware emulation to
>>>>>>> +        * provide underlying PF details.
>>>>>>> +        */
>>>>>>> +       if (zdev->vfn && !zdev->zbus->multifunction)
>>>>>>> +               pdev->detached_vf = 1;
>>>>>>
>>>>>> The enable hook seems like it's a bit too late for this sort of
>>>>>> screwing around with the pci_dev. Anything in the setup path that
>>>>>> looks at ->detached_vf would see it cleared while anything that looks
>>>>>> after the device is enabled will see it set. Can this go into
>>>>>> pcibios_add_device() or a fixup instead?
>>>>>>
>>>>>
>>>>> This particular check could go into pcibios_add_device() yes.
>>>>> We're also currently working on a slight rework of how
>>>>> we establish the VF to parent PF linking including the sysfs
>>>>> part of that. The latter sadly can only go after the sysfs
>>>>> for the virtfn has been created and that only happens
>>>>> after all fixups. We would like to do both together because
>>>>> the latter sets pdev->is_virtfn which I think is closely related.
>>>>>
>>>>> I was thinking of starting another discussion
>>>>> about adding a hook that is executed just after the sysfs entries
>>>>> for the PCI device are created but haven't yet.
>>>>
>>>> if all you need is sysfs then pcibios_bus_add_device() or a bus
>>>> notifier should work
>>>
>>> So this might go a bit off track but the problem is that
>>> on s390 a VF can be disabled and reenabled with disable_slot()/enable_slot().
>>> In this case pcibios_bus_add_device() is not called again but
>>> the PF/VF link needs to be reestablished.
>>
>> Scratch that I must have made some stupid mistake last time I tried
>> this, with your suggestion I tried again and it works perfectly
>> moving the setup into pcibios_bus_add_device().
>> Thank you, this is actually much nicer!
>>
> 
> OK, and I can likewise relocate the setting of detached_vf to pcibios_bus_add_device().
> 
Yes and I would suggest we add it in arch/s390/pci_bus.c just
after the the setup_virtfn stuff, then that can stay static
and the fix minimal.
I'll send a new version of my patches internally later, still
running it on the different configurations.
diff mbox series

Patch

diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 3902c9f..04ac76d 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -581,6 +581,14 @@  int pcibios_enable_device(struct pci_dev *pdev, int mask)
 {
 	struct zpci_dev *zdev = to_zpci(pdev);
 
+	/*
+	 * If we have a VF on a non-multifunction bus, it must be a VF that is
+	 * detached from its parent PF.  We rely on firmware emulation to
+	 * provide underlying PF details.
+	 */
+	if (zdev->vfn && !zdev->zbus->multifunction)
+		pdev->detached_vf = 1;
+
 	zpci_debug_init_device(zdev, dev_name(&pdev->dev));
 	zpci_fmb_enable_device(zdev);
 
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index d98843f..ee45216 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -406,7 +406,8 @@  bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev)
 	 * PF SR-IOV capability, there's therefore no need to trigger
 	 * faults based on the virtual value.
 	 */
-	return pdev->is_virtfn || (cmd & PCI_COMMAND_MEMORY);
+	return pdev->is_virtfn || pdev->detached_vf ||
+	       (cmd & PCI_COMMAND_MEMORY);
 }
 
 /*
@@ -420,7 +421,7 @@  static void vfio_bar_restore(struct vfio_pci_device *vdev)
 	u16 cmd;
 	int i;
 
-	if (pdev->is_virtfn)
+	if (pdev->is_virtfn || pdev->detached_vf)
 		return;
 
 	pci_info(pdev, "%s: reset recovery - restoring BARs\n", __func__);
@@ -521,7 +522,8 @@  static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos,
 	count = vfio_default_config_read(vdev, pos, count, perm, offset, val);
 
 	/* Mask in virtual memory enable for SR-IOV devices */
-	if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) {
+	if ((offset == PCI_COMMAND) &&
+	    (vdev->pdev->is_virtfn || vdev->pdev->detached_vf)) {
 		u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]);
 		u32 tmp_val = le32_to_cpu(*val);
 
@@ -1734,7 +1736,8 @@  int vfio_config_init(struct vfio_pci_device *vdev)
 				 vconfig[PCI_INTERRUPT_PIN]);
 
 		vconfig[PCI_INTERRUPT_PIN] = 0; /* Gratuitous for good VFs */
-
+	}
+	if (pdev->is_virtfn || pdev->detached_vf) {
 		/*
 		 * VFs do no implement the memory enable bit of the COMMAND
 		 * register therefore we'll not have it set in our initial
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8355306..23a6972 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -445,6 +445,7 @@  struct pci_dev {
 	unsigned int	is_probed:1;		/* Device probing in progress */
 	unsigned int	link_active_reporting:1;/* Device capable of reporting link active */
 	unsigned int	no_vf_scan:1;		/* Don't scan for VFs after IOV enablement */
+	unsigned int	detached_vf:1;		/* VF without local PF access */
 	pci_dev_flags_t dev_flags;
 	atomic_t	enable_cnt;	/* pci_enable_device has been called */