diff mbox series

[v5,03/14] vpci: move lock outside of struct vpci

Message ID 20211125110251.2877218-4-andr2000@gmail.com (mailing list archive)
State New, archived
Headers show
Series PCI devices passthrough on Arm, part 3 | expand

Commit Message

Oleksandr Andrushchenko Nov. 25, 2021, 11:02 a.m. UTC
From: Roger Pau Monne <roger.pau@citrix.com>

This way the lock can be used to check whether vpci is present, and
removal can be performed while holding the lock, in order to make
sure there are no accesses to the contents of the vpci struct.
Previously removal could race with vpci_read for example, since the
lock was dropped prior to freeing pdev->vpci.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
---
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: Ian Jackson <ian.jackson@eu.citrix.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Julien Grall <julien@xen.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
---
New in v5 of this series: this is an updated version of the patch published at
https://lore.kernel.org/xen-devel/20180717094830.54806-2-roger.pau@citrix.com/

Changes since v2:
 - fixed pdev->vpci = xzalloc(struct vpci); under spin_lock (Jan)
Changes since v1:
 - Assert that vpci_lock is locked in vpci_remove_device_locked.
 - Remove double newline.
 - Shrink critical section in vpci_{read/write}.
---
 tools/tests/vpci/emul.h       |  5 ++-
 tools/tests/vpci/main.c       |  4 +--
 xen/arch/x86/hvm/vmsi.c       |  8 ++---
 xen/drivers/passthrough/pci.c |  1 +
 xen/drivers/vpci/header.c     | 21 +++++++----
 xen/drivers/vpci/msi.c        | 11 ++++--
 xen/drivers/vpci/msix.c       |  8 ++---
 xen/drivers/vpci/vpci.c       | 68 +++++++++++++++++++++++------------
 xen/include/xen/pci.h         |  1 +
 xen/include/xen/vpci.h        |  5 +--
 10 files changed, 85 insertions(+), 47 deletions(-)

Comments

Roger Pau Monné Jan. 11, 2022, 3:17 p.m. UTC | #1
On Thu, Nov 25, 2021 at 01:02:40PM +0200, Oleksandr Andrushchenko wrote:
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index 657697fe3406..ceaac4516ff8 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -35,12 +35,10 @@ extern vpci_register_init_t *const __start_vpci_array[];
>  extern vpci_register_init_t *const __end_vpci_array[];
>  #define NUM_VPCI_INIT (__end_vpci_array - __start_vpci_array)
>  
> -void vpci_remove_device(struct pci_dev *pdev)
> +static void vpci_remove_device_handlers_locked(struct pci_dev *pdev)
>  {
> -    if ( !has_vpci(pdev->domain) )
> -        return;
> +    ASSERT(spin_is_locked(&pdev->vpci_lock));
>  
> -    spin_lock(&pdev->vpci->lock);
>      while ( !list_empty(&pdev->vpci->handlers) )
>      {
>          struct vpci_register *r = list_first_entry(&pdev->vpci->handlers,
> @@ -50,15 +48,33 @@ void vpci_remove_device(struct pci_dev *pdev)
>          list_del(&r->node);
>          xfree(r);
>      }
> -    spin_unlock(&pdev->vpci->lock);
> +}
> +
> +void vpci_remove_device_locked(struct pci_dev *pdev)

I think this could be static instead, as it's only used by
vpci_remove_device and vpci_add_handlers which are local to the
file.

Thanks, Roger.
Jan Beulich Jan. 12, 2022, 2:42 p.m. UTC | #2
On 11.01.2022 16:17, Roger Pau Monné wrote:
> On Thu, Nov 25, 2021 at 01:02:40PM +0200, Oleksandr Andrushchenko wrote:
>> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
>> index 657697fe3406..ceaac4516ff8 100644
>> --- a/xen/drivers/vpci/vpci.c
>> +++ b/xen/drivers/vpci/vpci.c
>> @@ -35,12 +35,10 @@ extern vpci_register_init_t *const __start_vpci_array[];
>>  extern vpci_register_init_t *const __end_vpci_array[];
>>  #define NUM_VPCI_INIT (__end_vpci_array - __start_vpci_array)
>>  
>> -void vpci_remove_device(struct pci_dev *pdev)
>> +static void vpci_remove_device_handlers_locked(struct pci_dev *pdev)
>>  {
>> -    if ( !has_vpci(pdev->domain) )
>> -        return;
>> +    ASSERT(spin_is_locked(&pdev->vpci_lock));
>>  
>> -    spin_lock(&pdev->vpci->lock);
>>      while ( !list_empty(&pdev->vpci->handlers) )
>>      {
>>          struct vpci_register *r = list_first_entry(&pdev->vpci->handlers,
>> @@ -50,15 +48,33 @@ void vpci_remove_device(struct pci_dev *pdev)
>>          list_del(&r->node);
>>          xfree(r);
>>      }
>> -    spin_unlock(&pdev->vpci->lock);
>> +}
>> +
>> +void vpci_remove_device_locked(struct pci_dev *pdev)
> 
> I think this could be static instead, as it's only used by
> vpci_remove_device and vpci_add_handlers which are local to the
> file.

Does the splitting out of vpci_remove_device_handlers_locked() belong in
this patch in the first place? There's no second caller being added, so
this looks to be an orthogonal adjustment.

Jan
Jan Beulich Jan. 12, 2022, 2:57 p.m. UTC | #3
On 25.11.2021 12:02, Oleksandr Andrushchenko wrote:
> @@ -68,12 +84,13 @@ int vpci_add_handlers(struct pci_dev *pdev)
>      /* We should not get here twice for the same device. */
>      ASSERT(!pdev->vpci);
>  
> -    pdev->vpci = xzalloc(struct vpci);
> -    if ( !pdev->vpci )
> +    vpci = xzalloc(struct vpci);
> +    if ( !vpci )
>          return -ENOMEM;
>  
> +    spin_lock(&pdev->vpci_lock);
> +    pdev->vpci = vpci;
>      INIT_LIST_HEAD(&pdev->vpci->handlers);
> -    spin_lock_init(&pdev->vpci->lock);

INIT_LIST_HEAD() can occur ahead of taking the lock, and can also act
on &vpci->handlers rather than &pdev->vpci->handlers.

>      for ( i = 0; i < NUM_VPCI_INIT; i++ )
>      {
> @@ -83,7 +100,8 @@ int vpci_add_handlers(struct pci_dev *pdev)
>      }

This loop wants to live in the locked region because you need to install
vpci into pdev->vpci up front, afaict. I wonder whether that couldn't
be relaxed, but perhaps that's an improvement that can be thought about
later.

The main reason I'm looking at this closely is because from the patch
title I didn't expect new locking regions to be introduced right here;
instead I did expect strictly a mechanical conversion.

> @@ -152,8 +170,6 @@ int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler,
>      r->offset = offset;
>      r->private = data;
>  
> -    spin_lock(&vpci->lock);

From the description I can't deduce why this lock is fine to go away
now, i.e. that all call sites have the lock now acquire earlier.
Therefore I'd expect at least an assertion to be left here ...

> @@ -183,7 +197,6 @@ int vpci_remove_register(struct vpci *vpci, unsigned int offset,
>      const struct vpci_register r = { .offset = offset, .size = size };
>      struct vpci_register *rm;
>  
> -    spin_lock(&vpci->lock);

... and here.

> @@ -370,6 +386,7 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
>              break;
>          ASSERT(data_offset < size);
>      }
> +    spin_unlock(&pdev->vpci_lock);
>  
>      if ( data_offset < size )
>      {
> @@ -379,7 +396,6 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
>  
>          data = merge_result(data, tmp_data, size - data_offset, data_offset);
>      }
> -    spin_unlock(&pdev->vpci->lock);
>  
>      return data & (0xffffffff >> (32 - 8 * size));
>  }

Here and ...

> @@ -475,13 +498,12 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
>              break;
>          ASSERT(data_offset < size);
>      }
> +    spin_unlock(&pdev->vpci_lock);
>  
>      if ( data_offset < size )
>          /* Tailing gap, write the remaining. */
>          vpci_write_hw(sbdf, reg + data_offset, size - data_offset,
>                        data >> (data_offset * 8));
> -
> -    spin_unlock(&pdev->vpci->lock);
>  }

... even more so here I'm not sure of the correctness of the moving
you do: While pdev->vpci indeed doesn't get accessed, I wonder
whether there wasn't an intention to avoid racing calls to
vpci_{read,write}_hw() this way. In any event I think such movement
would need justification in the description.

Jan
Roger Pau Monné Jan. 12, 2022, 3:42 p.m. UTC | #4
On Wed, Jan 12, 2022 at 03:57:36PM +0100, Jan Beulich wrote:
> On 25.11.2021 12:02, Oleksandr Andrushchenko wrote:
> > @@ -379,7 +396,6 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
> >  
> >          data = merge_result(data, tmp_data, size - data_offset, data_offset);
> >      }
> > -    spin_unlock(&pdev->vpci->lock);
> >  
> >      return data & (0xffffffff >> (32 - 8 * size));
> >  }
> 
> Here and ...
> 
> > @@ -475,13 +498,12 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
> >              break;
> >          ASSERT(data_offset < size);
> >      }
> > +    spin_unlock(&pdev->vpci_lock);
> >  
> >      if ( data_offset < size )
> >          /* Tailing gap, write the remaining. */
> >          vpci_write_hw(sbdf, reg + data_offset, size - data_offset,
> >                        data >> (data_offset * 8));
> > -
> > -    spin_unlock(&pdev->vpci->lock);
> >  }
> 
> ... even more so here I'm not sure of the correctness of the moving
> you do: While pdev->vpci indeed doesn't get accessed, I wonder
> whether there wasn't an intention to avoid racing calls to
> vpci_{read,write}_hw() this way. In any event I think such movement
> would need justification in the description.

I agree about the need for justification in the commit message, or
even better this being split into a pre-patch, as it's not related to
the lock switching done here.

I do think this is fine however, as racing calls to
vpci_{read,write}_hw() shouldn't be a problem. Those are just wrappers
around pci_conf_{read,write} functions, and the required locking (in
case of using the IO ports) is already taken care in
pci_conf_{read,write}.

Thanks, Roger.
Jan Beulich Jan. 12, 2022, 3:52 p.m. UTC | #5
On 12.01.2022 16:42, Roger Pau Monné wrote:
> On Wed, Jan 12, 2022 at 03:57:36PM +0100, Jan Beulich wrote:
>> On 25.11.2021 12:02, Oleksandr Andrushchenko wrote:
>>> @@ -379,7 +396,6 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
>>>  
>>>          data = merge_result(data, tmp_data, size - data_offset, data_offset);
>>>      }
>>> -    spin_unlock(&pdev->vpci->lock);
>>>  
>>>      return data & (0xffffffff >> (32 - 8 * size));
>>>  }
>>
>> Here and ...
>>
>>> @@ -475,13 +498,12 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
>>>              break;
>>>          ASSERT(data_offset < size);
>>>      }
>>> +    spin_unlock(&pdev->vpci_lock);
>>>  
>>>      if ( data_offset < size )
>>>          /* Tailing gap, write the remaining. */
>>>          vpci_write_hw(sbdf, reg + data_offset, size - data_offset,
>>>                        data >> (data_offset * 8));
>>> -
>>> -    spin_unlock(&pdev->vpci->lock);
>>>  }
>>
>> ... even more so here I'm not sure of the correctness of the moving
>> you do: While pdev->vpci indeed doesn't get accessed, I wonder
>> whether there wasn't an intention to avoid racing calls to
>> vpci_{read,write}_hw() this way. In any event I think such movement
>> would need justification in the description.
> 
> I agree about the need for justification in the commit message, or
> even better this being split into a pre-patch, as it's not related to
> the lock switching done here.
> 
> I do think this is fine however, as racing calls to
> vpci_{read,write}_hw() shouldn't be a problem. Those are just wrappers
> around pci_conf_{read,write} functions, and the required locking (in
> case of using the IO ports) is already taken care in
> pci_conf_{read,write}.

IOW you consider it acceptable for a guest (really: Dom0) read racing
a write to read back only part of what was written (so far)? I would
think individual multi-byte reads and writes should appear atomic to
the guest.

Jan
Roger Pau Monné Jan. 13, 2022, 8:58 a.m. UTC | #6
On Wed, Jan 12, 2022 at 04:52:51PM +0100, Jan Beulich wrote:
> On 12.01.2022 16:42, Roger Pau Monné wrote:
> > On Wed, Jan 12, 2022 at 03:57:36PM +0100, Jan Beulich wrote:
> >> On 25.11.2021 12:02, Oleksandr Andrushchenko wrote:
> >>> @@ -379,7 +396,6 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
> >>>  
> >>>          data = merge_result(data, tmp_data, size - data_offset, data_offset);
> >>>      }
> >>> -    spin_unlock(&pdev->vpci->lock);
> >>>  
> >>>      return data & (0xffffffff >> (32 - 8 * size));
> >>>  }
> >>
> >> Here and ...
> >>
> >>> @@ -475,13 +498,12 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
> >>>              break;
> >>>          ASSERT(data_offset < size);
> >>>      }
> >>> +    spin_unlock(&pdev->vpci_lock);
> >>>  
> >>>      if ( data_offset < size )
> >>>          /* Tailing gap, write the remaining. */
> >>>          vpci_write_hw(sbdf, reg + data_offset, size - data_offset,
> >>>                        data >> (data_offset * 8));
> >>> -
> >>> -    spin_unlock(&pdev->vpci->lock);
> >>>  }
> >>
> >> ... even more so here I'm not sure of the correctness of the moving
> >> you do: While pdev->vpci indeed doesn't get accessed, I wonder
> >> whether there wasn't an intention to avoid racing calls to
> >> vpci_{read,write}_hw() this way. In any event I think such movement
> >> would need justification in the description.
> > 
> > I agree about the need for justification in the commit message, or
> > even better this being split into a pre-patch, as it's not related to
> > the lock switching done here.
> > 
> > I do think this is fine however, as racing calls to
> > vpci_{read,write}_hw() shouldn't be a problem. Those are just wrappers
> > around pci_conf_{read,write} functions, and the required locking (in
> > case of using the IO ports) is already taken care in
> > pci_conf_{read,write}.
> 
> IOW you consider it acceptable for a guest (really: Dom0) read racing
> a write to read back only part of what was written (so far)? I would
> think individual multi-byte reads and writes should appear atomic to
> the guest.

We split 64bit writes into two 32bit ones without taking the lock for
the whole duration of the access, so it's already possible to see a
partially updated state as a result of a 64bit write.

I'm going over the PCI(e) spec but I don't seem to find anything about
whether the ECAM is allowed to split memory transactions into multiple
Configuration Requests, and whether those could then interleave with
requests from a different CPU.

Thanks, Roger.
Oleksandr Andrushchenko Jan. 26, 2022, 8:40 a.m. UTC | #7
Hello, Roger, Jan!

On 12.01.22 16:42, Jan Beulich wrote:
> On 11.01.2022 16:17, Roger Pau Monné wrote:
>> On Thu, Nov 25, 2021 at 01:02:40PM +0200, Oleksandr Andrushchenko wrote:
>>> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
>>> index 657697fe3406..ceaac4516ff8 100644
>>> --- a/xen/drivers/vpci/vpci.c
>>> +++ b/xen/drivers/vpci/vpci.c
>>> @@ -35,12 +35,10 @@ extern vpci_register_init_t *const __start_vpci_array[];
>>>   extern vpci_register_init_t *const __end_vpci_array[];
>>>   #define NUM_VPCI_INIT (__end_vpci_array - __start_vpci_array)
>>>   
>>> -void vpci_remove_device(struct pci_dev *pdev)
>>> +static void vpci_remove_device_handlers_locked(struct pci_dev *pdev)
>>>   {
>>> -    if ( !has_vpci(pdev->domain) )
>>> -        return;
>>> +    ASSERT(spin_is_locked(&pdev->vpci_lock));
>>>   
>>> -    spin_lock(&pdev->vpci->lock);
>>>       while ( !list_empty(&pdev->vpci->handlers) )
>>>       {
>>>           struct vpci_register *r = list_first_entry(&pdev->vpci->handlers,
>>> @@ -50,15 +48,33 @@ void vpci_remove_device(struct pci_dev *pdev)
>>>           list_del(&r->node);
>>>           xfree(r);
>>>       }
>>> -    spin_unlock(&pdev->vpci->lock);
>>> +}
>>> +
>>> +void vpci_remove_device_locked(struct pci_dev *pdev)
>> I think this could be static instead, as it's only used by
>> vpci_remove_device and vpci_add_handlers which are local to the
>> file.
This is going to be used outside later on while processing pending mappings,
so I think it is not worth it defining it static here and then removing the static
key word later on: please see [PATCH v5 04/14] vpci: cancel pending map/unmap on vpci removal [1]
> Does the splitting out of vpci_remove_device_handlers_locked() belong in
> this patch in the first place? There's no second caller being added, so
> this looks to be an orthogonal adjustment.
I think of it as a preparation for the upcoming code: although the reason for the
change might not be immediately seen in this patch it is still in line with what
happens next.
So, I would prefer to keep the change as is: anyways the whole series should probably
be committed as a single piece of work, so it won't look inconsistent then

Thank you,
Oleksandr
>
> Jan
>

[1] https://patchwork.kernel.org/project/xen-devel/patch/20211125110251.2877218-5-andr2000@gmail.com/
Roger Pau Monné Jan. 26, 2022, 11:13 a.m. UTC | #8
On Wed, Jan 26, 2022 at 08:40:09AM +0000, Oleksandr Andrushchenko wrote:
> Hello, Roger, Jan!
> 
> On 12.01.22 16:42, Jan Beulich wrote:
> > On 11.01.2022 16:17, Roger Pau Monné wrote:
> >> On Thu, Nov 25, 2021 at 01:02:40PM +0200, Oleksandr Andrushchenko wrote:
> >>> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> >>> index 657697fe3406..ceaac4516ff8 100644
> >>> --- a/xen/drivers/vpci/vpci.c
> >>> +++ b/xen/drivers/vpci/vpci.c
> >>> @@ -35,12 +35,10 @@ extern vpci_register_init_t *const __start_vpci_array[];
> >>>   extern vpci_register_init_t *const __end_vpci_array[];
> >>>   #define NUM_VPCI_INIT (__end_vpci_array - __start_vpci_array)
> >>>   
> >>> -void vpci_remove_device(struct pci_dev *pdev)
> >>> +static void vpci_remove_device_handlers_locked(struct pci_dev *pdev)
> >>>   {
> >>> -    if ( !has_vpci(pdev->domain) )
> >>> -        return;
> >>> +    ASSERT(spin_is_locked(&pdev->vpci_lock));
> >>>   
> >>> -    spin_lock(&pdev->vpci->lock);
> >>>       while ( !list_empty(&pdev->vpci->handlers) )
> >>>       {
> >>>           struct vpci_register *r = list_first_entry(&pdev->vpci->handlers,
> >>> @@ -50,15 +48,33 @@ void vpci_remove_device(struct pci_dev *pdev)
> >>>           list_del(&r->node);
> >>>           xfree(r);
> >>>       }
> >>> -    spin_unlock(&pdev->vpci->lock);
> >>> +}
> >>> +
> >>> +void vpci_remove_device_locked(struct pci_dev *pdev)
> >> I think this could be static instead, as it's only used by
> >> vpci_remove_device and vpci_add_handlers which are local to the
> >> file.
> This is going to be used outside later on while processing pending mappings,
> so I think it is not worth it defining it static here and then removing the static
> key word later on: please see [PATCH v5 04/14] vpci: cancel pending map/unmap on vpci removal [1]

I have some comments there also, which might change the approach
you are using.

> > Does the splitting out of vpci_remove_device_handlers_locked() belong in
> > this patch in the first place? There's no second caller being added, so
> > this looks to be an orthogonal adjustment.
> I think of it as a preparation for the upcoming code: although the reason for the
> change might not be immediately seen in this patch it is still in line with what
> happens next.

Right - it's generally best if the change is done together as the new
callers are added. Otherwise it's hard to understand why certain changes
are made, and you will likely get asked the same question on next
rounds.

It's also possible that the code that requires this is changed in
further iterations so there's no longer a need for the splitting.

Thanks, Roger.
Oleksandr Andrushchenko Jan. 28, 2022, 2:12 p.m. UTC | #9
Hi, Jan!

On 12.01.22 16:57, Jan Beulich wrote:
> On 25.11.2021 12:02, Oleksandr Andrushchenko wrote:
>> @@ -68,12 +84,13 @@ int vpci_add_handlers(struct pci_dev *pdev)
>>       /* We should not get here twice for the same device. */
>>       ASSERT(!pdev->vpci);
>>   
>> -    pdev->vpci = xzalloc(struct vpci);
>> -    if ( !pdev->vpci )
>> +    vpci = xzalloc(struct vpci);
>> +    if ( !vpci )
>>           return -ENOMEM;
>>   
>> +    spin_lock(&pdev->vpci_lock);
>> +    pdev->vpci = vpci;
>>       INIT_LIST_HEAD(&pdev->vpci->handlers);
>> -    spin_lock_init(&pdev->vpci->lock);
> INIT_LIST_HEAD() can occur ahead of taking the lock, and can also act
> on &vpci->handlers rather than &pdev->vpci->handlers.
Yes, I will move it, good catch
>>       for ( i = 0; i < NUM_VPCI_INIT; i++ )
>>       {
>> @@ -83,7 +100,8 @@ int vpci_add_handlers(struct pci_dev *pdev)
>>       }
> This loop wants to live in the locked region because you need to install
> vpci into pdev->vpci up front, afaict. I wonder whether that couldn't
> be relaxed, but perhaps that's an improvement that can be thought about
> later.
Ok, so I'll leave it as is
>
> The main reason I'm looking at this closely is because from the patch
> title I didn't expect new locking regions to be introduced right here;
> instead I did expect strictly a mechanical conversion.
>
>> @@ -152,8 +170,6 @@ int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler,
>>       r->offset = offset;
>>       r->private = data;
>>   
>> -    spin_lock(&vpci->lock);
>  From the description I can't deduce why this lock is fine to go away
> now, i.e. that all call sites have the lock now acquire earlier.
> Therefore I'd expect at least an assertion to be left here ...
>
>> @@ -183,7 +197,6 @@ int vpci_remove_register(struct vpci *vpci, unsigned int offset,
>>       const struct vpci_register r = { .offset = offset, .size = size };
>>       struct vpci_register *rm;
>>   
>> -    spin_lock(&vpci->lock);
> ... and here.
Previously the lock lived in struct vpci and now it lives in struct pci_dev which
is not visible here, so:
1. we cannot take that lock here and do expect for it to be acquired outside
2. we cannot add an ASSERT here as we would need
ASSERT(spin_is_locked(&pdev->vpci_lock));
and pdev is not here
All the callers of the vpci_{add|remove}_register are REGISTER_VPCI_INIT
functions which are called with &pdev->vpci_lock held.

So, while I agree that it would be indeed a good check with ASSERT here,
but adding an additional argument to the respective functions just for that
might not be a good idea IMO

I will describe this lock removal in the commit message

Thank you,
Oleksandr
Oleksandr Andrushchenko Jan. 28, 2022, 2:15 p.m. UTC | #10
Hi, Roger, Jan!

On 13.01.22 10:58, Roger Pau Monné wrote:
> On Wed, Jan 12, 2022 at 04:52:51PM +0100, Jan Beulich wrote:
>> On 12.01.2022 16:42, Roger Pau Monné wrote:
>>> On Wed, Jan 12, 2022 at 03:57:36PM +0100, Jan Beulich wrote:
>>>> On 25.11.2021 12:02, Oleksandr Andrushchenko wrote:
>>>>> @@ -379,7 +396,6 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
>>>>>   
>>>>>           data = merge_result(data, tmp_data, size - data_offset, data_offset);
>>>>>       }
>>>>> -    spin_unlock(&pdev->vpci->lock);
>>>>>   
>>>>>       return data & (0xffffffff >> (32 - 8 * size));
>>>>>   }
>>>> Here and ...
>>>>
>>>>> @@ -475,13 +498,12 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
>>>>>               break;
>>>>>           ASSERT(data_offset < size);
>>>>>       }
>>>>> +    spin_unlock(&pdev->vpci_lock);
>>>>>   
>>>>>       if ( data_offset < size )
>>>>>           /* Tailing gap, write the remaining. */
>>>>>           vpci_write_hw(sbdf, reg + data_offset, size - data_offset,
>>>>>                         data >> (data_offset * 8));
>>>>> -
>>>>> -    spin_unlock(&pdev->vpci->lock);
>>>>>   }
>>>> ... even more so here I'm not sure of the correctness of the moving
>>>> you do: While pdev->vpci indeed doesn't get accessed, I wonder
>>>> whether there wasn't an intention to avoid racing calls to
>>>> vpci_{read,write}_hw() this way. In any event I think such movement
>>>> would need justification in the description.
>>> I agree about the need for justification in the commit message, or
>>> even better this being split into a pre-patch, as it's not related to
>>> the lock switching done here.
>>>
>>> I do think this is fine however, as racing calls to
>>> vpci_{read,write}_hw() shouldn't be a problem. Those are just wrappers
>>> around pci_conf_{read,write} functions, and the required locking (in
>>> case of using the IO ports) is already taken care in
>>> pci_conf_{read,write}.
>> IOW you consider it acceptable for a guest (really: Dom0) read racing
>> a write to read back only part of what was written (so far)? I would
>> think individual multi-byte reads and writes should appear atomic to
>> the guest.
> We split 64bit writes into two 32bit ones without taking the lock for
> the whole duration of the access, so it's already possible to see a
> partially updated state as a result of a 64bit write.
>
> I'm going over the PCI(e) spec but I don't seem to find anything about
> whether the ECAM is allowed to split memory transactions into multiple
> Configuration Requests, and whether those could then interleave with
> requests from a different CPU.
So, with the above is it still fine for you to have the change as is or
you want this optimization to go into a dedicated patch before this one?
>
> Thanks, Roger.
Thank you,
Oleksandr
Oleksandr Andrushchenko Jan. 31, 2022, 7:41 a.m. UTC | #11
Hi, Jan, Roger!

On 26.01.22 13:13, Roger Pau Monné wrote:
> On Wed, Jan 26, 2022 at 08:40:09AM +0000, Oleksandr Andrushchenko wrote:
>> Hello, Roger, Jan!
>>
>> On 12.01.22 16:42, Jan Beulich wrote:
>>> On 11.01.2022 16:17, Roger Pau Monné wrote:
>>>> On Thu, Nov 25, 2021 at 01:02:40PM +0200, Oleksandr Andrushchenko wrote:
>>>>> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
>>>>> index 657697fe3406..ceaac4516ff8 100644
>>>>> --- a/xen/drivers/vpci/vpci.c
>>>>> +++ b/xen/drivers/vpci/vpci.c
>>>>> @@ -35,12 +35,10 @@ extern vpci_register_init_t *const __start_vpci_array[];
>>>>>    extern vpci_register_init_t *const __end_vpci_array[];
>>>>>    #define NUM_VPCI_INIT (__end_vpci_array - __start_vpci_array)
>>>>>    
>>>>> -void vpci_remove_device(struct pci_dev *pdev)
>>>>> +static void vpci_remove_device_handlers_locked(struct pci_dev *pdev)
>>>>>    {
>>>>> -    if ( !has_vpci(pdev->domain) )
>>>>> -        return;
>>>>> +    ASSERT(spin_is_locked(&pdev->vpci_lock));
>>>>>    
>>>>> -    spin_lock(&pdev->vpci->lock);
>>>>>        while ( !list_empty(&pdev->vpci->handlers) )
>>>>>        {
>>>>>            struct vpci_register *r = list_first_entry(&pdev->vpci->handlers,
>>>>> @@ -50,15 +48,33 @@ void vpci_remove_device(struct pci_dev *pdev)
>>>>>            list_del(&r->node);
>>>>>            xfree(r);
>>>>>        }
>>>>> -    spin_unlock(&pdev->vpci->lock);
>>>>> +}
>>>>> +
>>>>> +void vpci_remove_device_locked(struct pci_dev *pdev)
>>>> I think this could be static instead, as it's only used by
>>>> vpci_remove_device and vpci_add_handlers which are local to the
>>>> file.
>> This is going to be used outside later on while processing pending mappings,
>> so I think it is not worth it defining it static here and then removing the static
>> key word later on: please see [PATCH v5 04/14] vpci: cancel pending map/unmap on vpci removal [1]
> I have some comments there also, which might change the approach
> you are using.
>
>>> Does the splitting out of vpci_remove_device_handlers_locked() belong in
>>> this patch in the first place? There's no second caller being added, so
>>> this looks to be an orthogonal adjustment.
>> I think of it as a preparation for the upcoming code: although the reason for the
>> change might not be immediately seen in this patch it is still in line with what
>> happens next.
> Right - it's generally best if the change is done together as the new
> callers are added. Otherwise it's hard to understand why certain changes
> are made, and you will likely get asked the same question on next
> rounds.
>
> It's also possible that the code that requires this is changed in
> further iterations so there's no longer a need for the splitting.
Ok, sounds reasonable
I will not split the functions without the obvious need
>
> Thanks, Roger.
Thank you,
Oleksandr
Roger Pau Monné Jan. 31, 2022, 8:56 a.m. UTC | #12
On Fri, Jan 28, 2022 at 02:15:08PM +0000, Oleksandr Andrushchenko wrote:
> Hi, Roger, Jan!
> 
> On 13.01.22 10:58, Roger Pau Monné wrote:
> > On Wed, Jan 12, 2022 at 04:52:51PM +0100, Jan Beulich wrote:
> >> On 12.01.2022 16:42, Roger Pau Monné wrote:
> >>> On Wed, Jan 12, 2022 at 03:57:36PM +0100, Jan Beulich wrote:
> >>>> On 25.11.2021 12:02, Oleksandr Andrushchenko wrote:
> >>>>> @@ -379,7 +396,6 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
> >>>>>   
> >>>>>           data = merge_result(data, tmp_data, size - data_offset, data_offset);
> >>>>>       }
> >>>>> -    spin_unlock(&pdev->vpci->lock);
> >>>>>   
> >>>>>       return data & (0xffffffff >> (32 - 8 * size));
> >>>>>   }
> >>>> Here and ...
> >>>>
> >>>>> @@ -475,13 +498,12 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
> >>>>>               break;
> >>>>>           ASSERT(data_offset < size);
> >>>>>       }
> >>>>> +    spin_unlock(&pdev->vpci_lock);
> >>>>>   
> >>>>>       if ( data_offset < size )
> >>>>>           /* Tailing gap, write the remaining. */
> >>>>>           vpci_write_hw(sbdf, reg + data_offset, size - data_offset,
> >>>>>                         data >> (data_offset * 8));
> >>>>> -
> >>>>> -    spin_unlock(&pdev->vpci->lock);
> >>>>>   }
> >>>> ... even more so here I'm not sure of the correctness of the moving
> >>>> you do: While pdev->vpci indeed doesn't get accessed, I wonder
> >>>> whether there wasn't an intention to avoid racing calls to
> >>>> vpci_{read,write}_hw() this way. In any event I think such movement
> >>>> would need justification in the description.
> >>> I agree about the need for justification in the commit message, or
> >>> even better this being split into a pre-patch, as it's not related to
> >>> the lock switching done here.
> >>>
> >>> I do think this is fine however, as racing calls to
> >>> vpci_{read,write}_hw() shouldn't be a problem. Those are just wrappers
> >>> around pci_conf_{read,write} functions, and the required locking (in
> >>> case of using the IO ports) is already taken care in
> >>> pci_conf_{read,write}.
> >> IOW you consider it acceptable for a guest (really: Dom0) read racing
> >> a write to read back only part of what was written (so far)? I would
> >> think individual multi-byte reads and writes should appear atomic to
> >> the guest.
> > We split 64bit writes into two 32bit ones without taking the lock for
> > the whole duration of the access, so it's already possible to see a
> > partially updated state as a result of a 64bit write.
> >
> > I'm going over the PCI(e) spec but I don't seem to find anything about
> > whether the ECAM is allowed to split memory transactions into multiple
> > Configuration Requests, and whether those could then interleave with
> > requests from a different CPU.
> So, with the above is it still fine for you to have the change as is or
> you want this optimization to go into a dedicated patch before this one?

The change seems slightly controversial, so I think it would be best
if it was split into a separate patch with a proper reasoning in the
commit message.

Thanks, Roger.
Oleksandr Andrushchenko Jan. 31, 2022, 9 a.m. UTC | #13
Hi, Roger!

On 31.01.22 10:56, Roger Pau Monné wrote:
> On Fri, Jan 28, 2022 at 02:15:08PM +0000, Oleksandr Andrushchenko wrote:
>> Hi, Roger, Jan!
>>
>> On 13.01.22 10:58, Roger Pau Monné wrote:
>>> On Wed, Jan 12, 2022 at 04:52:51PM +0100, Jan Beulich wrote:
>>>> On 12.01.2022 16:42, Roger Pau Monné wrote:
>>>>> On Wed, Jan 12, 2022 at 03:57:36PM +0100, Jan Beulich wrote:
>>>>>> On 25.11.2021 12:02, Oleksandr Andrushchenko wrote:
>>>>>>> @@ -379,7 +396,6 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
>>>>>>>    
>>>>>>>            data = merge_result(data, tmp_data, size - data_offset, data_offset);
>>>>>>>        }
>>>>>>> -    spin_unlock(&pdev->vpci->lock);
>>>>>>>    
>>>>>>>        return data & (0xffffffff >> (32 - 8 * size));
>>>>>>>    }
>>>>>> Here and ...
>>>>>>
>>>>>>> @@ -475,13 +498,12 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
>>>>>>>                break;
>>>>>>>            ASSERT(data_offset < size);
>>>>>>>        }
>>>>>>> +    spin_unlock(&pdev->vpci_lock);
>>>>>>>    
>>>>>>>        if ( data_offset < size )
>>>>>>>            /* Tailing gap, write the remaining. */
>>>>>>>            vpci_write_hw(sbdf, reg + data_offset, size - data_offset,
>>>>>>>                          data >> (data_offset * 8));
>>>>>>> -
>>>>>>> -    spin_unlock(&pdev->vpci->lock);
>>>>>>>    }
>>>>>> ... even more so here I'm not sure of the correctness of the moving
>>>>>> you do: While pdev->vpci indeed doesn't get accessed, I wonder
>>>>>> whether there wasn't an intention to avoid racing calls to
>>>>>> vpci_{read,write}_hw() this way. In any event I think such movement
>>>>>> would need justification in the description.
>>>>> I agree about the need for justification in the commit message, or
>>>>> even better this being split into a pre-patch, as it's not related to
>>>>> the lock switching done here.
>>>>>
>>>>> I do think this is fine however, as racing calls to
>>>>> vpci_{read,write}_hw() shouldn't be a problem. Those are just wrappers
>>>>> around pci_conf_{read,write} functions, and the required locking (in
>>>>> case of using the IO ports) is already taken care in
>>>>> pci_conf_{read,write}.
>>>> IOW you consider it acceptable for a guest (really: Dom0) read racing
>>>> a write to read back only part of what was written (so far)? I would
>>>> think individual multi-byte reads and writes should appear atomic to
>>>> the guest.
>>> We split 64bit writes into two 32bit ones without taking the lock for
>>> the whole duration of the access, so it's already possible to see a
>>> partially updated state as a result of a 64bit write.
>>>
>>> I'm going over the PCI(e) spec but I don't seem to find anything about
>>> whether the ECAM is allowed to split memory transactions into multiple
>>> Configuration Requests, and whether those could then interleave with
>>> requests from a different CPU.
>> So, with the above is it still fine for you to have the change as is or
>> you want this optimization to go into a dedicated patch before this one?
> The change seems slightly controversial, so I think it would be best
> if it was split into a separate patch with a proper reasoning in the
> commit message.
Sure, will move into a dedicated patch then
>
> Thanks, Roger.
Thank you,
Oleksandr
diff mbox series

Patch

diff --git a/tools/tests/vpci/emul.h b/tools/tests/vpci/emul.h
index 2e1d3057c9d8..d018fb5eef21 100644
--- a/tools/tests/vpci/emul.h
+++ b/tools/tests/vpci/emul.h
@@ -44,6 +44,7 @@  struct domain {
 };
 
 struct pci_dev {
+    bool vpci_lock;
     struct vpci *vpci;
 };
 
@@ -53,10 +54,8 @@  struct vcpu
 };
 
 extern const struct vcpu *current;
-extern const struct pci_dev test_pdev;
+extern struct pci_dev test_pdev;
 
-typedef bool spinlock_t;
-#define spin_lock_init(l) (*(l) = false)
 #define spin_lock(l) (*(l) = true)
 #define spin_unlock(l) (*(l) = false)
 
diff --git a/tools/tests/vpci/main.c b/tools/tests/vpci/main.c
index b9a0a6006bb9..26c95b08b6b1 100644
--- a/tools/tests/vpci/main.c
+++ b/tools/tests/vpci/main.c
@@ -23,7 +23,8 @@  static struct vpci vpci;
 
 const static struct domain d;
 
-const struct pci_dev test_pdev = {
+struct pci_dev test_pdev = {
+    .vpci_lock = false,
     .vpci = &vpci,
 };
 
@@ -158,7 +159,6 @@  main(int argc, char **argv)
     int rc;
 
     INIT_LIST_HEAD(&vpci.handlers);
-    spin_lock_init(&vpci.lock);
 
     VPCI_ADD_REG(vpci_read32, vpci_write32, 0, 4, r0);
     VPCI_READ_CHECK(0, 4, r0);
diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 13e2a190b439..1f7a37f78264 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -910,14 +910,14 @@  int vpci_msix_arch_print(const struct vpci_msix *msix)
         {
             struct pci_dev *pdev = msix->pdev;
 
-            spin_unlock(&msix->pdev->vpci->lock);
+            spin_unlock(&msix->pdev->vpci_lock);
             process_pending_softirqs();
             /* NB: we assume that pdev cannot go away for an alive domain. */
-            if ( !pdev->vpci || !spin_trylock(&pdev->vpci->lock) )
+            if ( !spin_trylock(&pdev->vpci_lock) )
                 return -EBUSY;
-            if ( pdev->vpci->msix != msix )
+            if ( !pdev->vpci || pdev->vpci->msix != msix )
             {
-                spin_unlock(&pdev->vpci->lock);
+                spin_unlock(&pdev->vpci_lock);
                 return -EAGAIN;
             }
         }
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index a9d31293ac09..286808b25e65 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -328,6 +328,7 @@  static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
     *((u8*) &pdev->bus) = bus;
     *((u8*) &pdev->devfn) = devfn;
     pdev->domain = NULL;
+    spin_lock_init(&pdev->vpci_lock);
 
     arch_pci_init_pdev(pdev);
 
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 40ff79c33f8f..bd23c0274d48 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -142,12 +142,13 @@  bool vpci_process_pending(struct vcpu *v)
         if ( rc == -ERESTART )
             return true;
 
-        spin_lock(&v->vpci.pdev->vpci->lock);
-        /* Disable memory decoding unconditionally on failure. */
-        modify_decoding(v->vpci.pdev,
-                        rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
-                        !rc && v->vpci.rom_only);
-        spin_unlock(&v->vpci.pdev->vpci->lock);
+        spin_lock(&v->vpci.pdev->vpci_lock);
+        if ( v->vpci.pdev->vpci )
+            /* Disable memory decoding unconditionally on failure. */
+            modify_decoding(v->vpci.pdev,
+                            rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
+                            !rc && v->vpci.rom_only);
+        spin_unlock(&v->vpci.pdev->vpci_lock);
 
         rangeset_destroy(v->vpci.mem);
         v->vpci.mem = NULL;
@@ -285,6 +286,12 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
                 continue;
         }
 
+        spin_lock(&tmp->vpci_lock);
+        if ( !tmp->vpci )
+        {
+            spin_unlock(&tmp->vpci_lock);
+            continue;
+        }
         for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ )
         {
             const struct vpci_bar *bar = &tmp->vpci->header.bars[i];
@@ -303,12 +310,14 @@  static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
             rc = rangeset_remove_range(mem, start, end);
             if ( rc )
             {
+                spin_unlock(&tmp->vpci_lock);
                 printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
                        start, end, rc);
                 rangeset_destroy(mem);
                 return rc;
             }
         }
+        spin_unlock(&tmp->vpci_lock);
     }
 
     ASSERT(dev);
diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
index 5757a7aed20f..e3ce46869dad 100644
--- a/xen/drivers/vpci/msi.c
+++ b/xen/drivers/vpci/msi.c
@@ -270,7 +270,7 @@  void vpci_dump_msi(void)
     rcu_read_lock(&domlist_read_lock);
     for_each_domain ( d )
     {
-        const struct pci_dev *pdev;
+        struct pci_dev *pdev;
 
         if ( !has_vpci(d) )
             continue;
@@ -282,8 +282,13 @@  void vpci_dump_msi(void)
             const struct vpci_msi *msi;
             const struct vpci_msix *msix;
 
-            if ( !pdev->vpci || !spin_trylock(&pdev->vpci->lock) )
+            if ( !spin_trylock(&pdev->vpci_lock) )
                 continue;
+            if ( !pdev->vpci )
+            {
+                spin_unlock(&pdev->vpci_lock);
+                continue;
+            }
 
             msi = pdev->vpci->msi;
             if ( msi && msi->enabled )
@@ -323,7 +328,7 @@  void vpci_dump_msi(void)
                 }
             }
 
-            spin_unlock(&pdev->vpci->lock);
+            spin_unlock(&pdev->vpci_lock);
             process_pending_softirqs();
         }
     }
diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c
index 846f1b8d7038..5310cc3ff520 100644
--- a/xen/drivers/vpci/msix.c
+++ b/xen/drivers/vpci/msix.c
@@ -225,7 +225,7 @@  static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
         return X86EMUL_OKAY;
     }
 
-    spin_lock(&msix->pdev->vpci->lock);
+    spin_lock(&msix->pdev->vpci_lock);
     entry = get_entry(msix, addr);
     offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
 
@@ -254,7 +254,7 @@  static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
         ASSERT_UNREACHABLE();
         break;
     }
-    spin_unlock(&msix->pdev->vpci->lock);
+    spin_unlock(&msix->pdev->vpci_lock);
 
     return X86EMUL_OKAY;
 }
@@ -297,7 +297,7 @@  static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
         return X86EMUL_OKAY;
     }
 
-    spin_lock(&msix->pdev->vpci->lock);
+    spin_lock(&msix->pdev->vpci_lock);
     entry = get_entry(msix, addr);
     offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
 
@@ -370,7 +370,7 @@  static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
         ASSERT_UNREACHABLE();
         break;
     }
-    spin_unlock(&msix->pdev->vpci->lock);
+    spin_unlock(&msix->pdev->vpci_lock);
 
     return X86EMUL_OKAY;
 }
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 657697fe3406..ceaac4516ff8 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -35,12 +35,10 @@  extern vpci_register_init_t *const __start_vpci_array[];
 extern vpci_register_init_t *const __end_vpci_array[];
 #define NUM_VPCI_INIT (__end_vpci_array - __start_vpci_array)
 
-void vpci_remove_device(struct pci_dev *pdev)
+static void vpci_remove_device_handlers_locked(struct pci_dev *pdev)
 {
-    if ( !has_vpci(pdev->domain) )
-        return;
+    ASSERT(spin_is_locked(&pdev->vpci_lock));
 
-    spin_lock(&pdev->vpci->lock);
     while ( !list_empty(&pdev->vpci->handlers) )
     {
         struct vpci_register *r = list_first_entry(&pdev->vpci->handlers,
@@ -50,15 +48,33 @@  void vpci_remove_device(struct pci_dev *pdev)
         list_del(&r->node);
         xfree(r);
     }
-    spin_unlock(&pdev->vpci->lock);
+}
+
+void vpci_remove_device_locked(struct pci_dev *pdev)
+{
+    ASSERT(spin_is_locked(&pdev->vpci_lock));
+
+    vpci_remove_device_handlers_locked(pdev);
     xfree(pdev->vpci->msix);
     xfree(pdev->vpci->msi);
     xfree(pdev->vpci);
     pdev->vpci = NULL;
 }
 
+void vpci_remove_device(struct pci_dev *pdev)
+{
+    if ( !has_vpci(pdev->domain) )
+        return;
+
+    spin_lock(&pdev->vpci_lock);
+    if ( pdev->vpci )
+        vpci_remove_device_locked(pdev);
+    spin_unlock(&pdev->vpci_lock);
+}
+
 int vpci_add_handlers(struct pci_dev *pdev)
 {
+    struct vpci *vpci;
     unsigned int i;
     int rc = 0;
 
@@ -68,12 +84,13 @@  int vpci_add_handlers(struct pci_dev *pdev)
     /* We should not get here twice for the same device. */
     ASSERT(!pdev->vpci);
 
-    pdev->vpci = xzalloc(struct vpci);
-    if ( !pdev->vpci )
+    vpci = xzalloc(struct vpci);
+    if ( !vpci )
         return -ENOMEM;
 
+    spin_lock(&pdev->vpci_lock);
+    pdev->vpci = vpci;
     INIT_LIST_HEAD(&pdev->vpci->handlers);
-    spin_lock_init(&pdev->vpci->lock);
 
     for ( i = 0; i < NUM_VPCI_INIT; i++ )
     {
@@ -83,7 +100,8 @@  int vpci_add_handlers(struct pci_dev *pdev)
     }
 
     if ( rc )
-        vpci_remove_device(pdev);
+        vpci_remove_device_locked(pdev);
+    spin_unlock(&pdev->vpci_lock);
 
     return rc;
 }
@@ -152,8 +170,6 @@  int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler,
     r->offset = offset;
     r->private = data;
 
-    spin_lock(&vpci->lock);
-
     /* The list of handlers must be kept sorted at all times. */
     list_for_each ( prev, &vpci->handlers )
     {
@@ -165,14 +181,12 @@  int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler,
             break;
         if ( cmp == 0 )
         {
-            spin_unlock(&vpci->lock);
             xfree(r);
             return -EEXIST;
         }
     }
 
     list_add_tail(&r->node, prev);
-    spin_unlock(&vpci->lock);
 
     return 0;
 }
@@ -183,7 +197,6 @@  int vpci_remove_register(struct vpci *vpci, unsigned int offset,
     const struct vpci_register r = { .offset = offset, .size = size };
     struct vpci_register *rm;
 
-    spin_lock(&vpci->lock);
     list_for_each_entry ( rm, &vpci->handlers, node )
     {
         int cmp = vpci_register_cmp(&r, rm);
@@ -195,14 +208,12 @@  int vpci_remove_register(struct vpci *vpci, unsigned int offset,
         if ( !cmp && rm->offset == offset && rm->size == size )
         {
             list_del(&rm->node);
-            spin_unlock(&vpci->lock);
             xfree(rm);
             return 0;
         }
         if ( cmp <= 0 )
             break;
     }
-    spin_unlock(&vpci->lock);
 
     return -ENOENT;
 }
@@ -311,7 +322,7 @@  static uint32_t merge_result(uint32_t data, uint32_t new, unsigned int size,
 uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
 {
     const struct domain *d = current->domain;
-    const struct pci_dev *pdev;
+    struct pci_dev *pdev;
     const struct vpci_register *r;
     unsigned int data_offset = 0;
     uint32_t data = ~(uint32_t)0;
@@ -327,7 +338,12 @@  uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
     if ( !pdev )
         return vpci_read_hw(sbdf, reg, size);
 
-    spin_lock(&pdev->vpci->lock);
+    spin_lock(&pdev->vpci_lock);
+    if ( !pdev->vpci )
+    {
+        spin_unlock(&pdev->vpci_lock);
+        return vpci_read_hw(sbdf, reg, size);
+    }
 
     /* Read from the hardware or the emulated register handlers. */
     list_for_each_entry ( r, &pdev->vpci->handlers, node )
@@ -370,6 +386,7 @@  uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
             break;
         ASSERT(data_offset < size);
     }
+    spin_unlock(&pdev->vpci_lock);
 
     if ( data_offset < size )
     {
@@ -379,7 +396,6 @@  uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
 
         data = merge_result(data, tmp_data, size - data_offset, data_offset);
     }
-    spin_unlock(&pdev->vpci->lock);
 
     return data & (0xffffffff >> (32 - 8 * size));
 }
@@ -414,7 +430,7 @@  void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
                 uint32_t data)
 {
     const struct domain *d = current->domain;
-    const struct pci_dev *pdev;
+    struct pci_dev *pdev;
     const struct vpci_register *r;
     unsigned int data_offset = 0;
     const unsigned long *ro_map = pci_get_ro_map(sbdf.seg);
@@ -440,7 +456,14 @@  void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
         return;
     }
 
-    spin_lock(&pdev->vpci->lock);
+    spin_lock(&pdev->vpci_lock);
+    if ( !pdev->vpci )
+    {
+        spin_unlock(&pdev->vpci_lock);
+        vpci_write_hw(sbdf, reg, size, data);
+        return;
+    }
+
 
     /* Write the value to the hardware or emulated registers. */
     list_for_each_entry ( r, &pdev->vpci->handlers, node )
@@ -475,13 +498,12 @@  void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
             break;
         ASSERT(data_offset < size);
     }
+    spin_unlock(&pdev->vpci_lock);
 
     if ( data_offset < size )
         /* Tailing gap, write the remaining. */
         vpci_write_hw(sbdf, reg + data_offset, size - data_offset,
                       data >> (data_offset * 8));
-
-    spin_unlock(&pdev->vpci->lock);
 }
 
 /* Helper function to check an access size and alignment on vpci space. */
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index b6d7e454f814..3f60d6c6c6dd 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -134,6 +134,7 @@  struct pci_dev {
     u64 vf_rlen[6];
 
     /* Data for vPCI. */
+    spinlock_t vpci_lock;
     struct vpci *vpci;
 };
 
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 3f32de9d7eb3..8b22bdef11d0 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -30,8 +30,9 @@  int __must_check vpci_add_handlers(struct pci_dev *dev);
 
 /* Remove all handlers and free vpci related structures. */
 void vpci_remove_device(struct pci_dev *pdev);
+void vpci_remove_device_locked(struct pci_dev *pdev);
 
-/* Add/remove a register handler. */
+/* Add/remove a register handler. Must be called holding the vpci_lock. */
 int __must_check vpci_add_register(struct vpci *vpci,
                                    vpci_read_t *read_handler,
                                    vpci_write_t *write_handler,
@@ -60,7 +61,6 @@  bool __must_check vpci_process_pending(struct vcpu *v);
 struct vpci {
     /* List of vPCI handlers for a device. */
     struct list_head handlers;
-    spinlock_t lock;
 
 #ifdef __XEN__
     /* Hide the rest of the vpci struct from the user-space test harness. */
@@ -231,6 +231,7 @@  static inline int vpci_add_handlers(struct pci_dev *pdev)
 }
 
 static inline void vpci_remove_device(struct pci_dev *pdev) { }
+static inline void vpci_remove_device_locked(struct pci_dev *pdev) { }
 
 static inline void vpci_dump_msi(void) { }