Message ID | 20220204063459.680961-4-andr2000@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | PCI devices passthrough on Arm, part 3 | expand |
On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: > @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) > continue; > } > > + spin_lock(&tmp->vpci_lock); > + if ( !tmp->vpci ) > + { > + spin_unlock(&tmp->vpci_lock); > + continue; > + } > for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) > { > const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; > @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) > rc = rangeset_remove_range(mem, start, end); > if ( rc ) > { > + spin_unlock(&tmp->vpci_lock); > printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", > start, end, rc); > rangeset_destroy(mem); > return rc; > } > } > + spin_unlock(&tmp->vpci_lock); > } At the first glance this simply looks like another unjustified (in the description) change, as you're not converting anything here but you actually add locking (and I realize this was there before, so I'm sorry for not pointing this out earlier). But then I wonder whether you actually tested this, since I can't help getting the impression that you're introducing a live-lock: The function is called from cmd_write() and rom_write(), which in turn are called out of vpci_write(). Yet that function already holds the lock, and the lock is not (currently) recursive. (For the 3rd caller of the function - init_bars() - otoh the locking looks to be entirely unnecessary.) Then again this was present already even in Roger's original patch, so I guess I must be missing something ... 
> --- a/xen/drivers/vpci/msix.c > +++ b/xen/drivers/vpci/msix.c > @@ -138,7 +138,7 @@ static void control_write(const struct pci_dev *pdev, unsigned int reg, > pci_conf_write16(pdev->sbdf, reg, val); > } > > -static struct vpci_msix *msix_find(const struct domain *d, unsigned long addr) > +static struct vpci_msix *msix_get(const struct domain *d, unsigned long addr) > { > struct vpci_msix *msix; > > @@ -150,15 +150,29 @@ static struct vpci_msix *msix_find(const struct domain *d, unsigned long addr) > for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ ) > if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled && > VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) ) > + { > + spin_lock(&msix->pdev->vpci_lock); > return msix; > + } I think deliberately returning with a lock held requires a respective comment ahead of the function. > } > > return NULL; > } > > +static void msix_put(struct vpci_msix *msix) > +{ > + if ( !msix ) > + return; > + > + spin_unlock(&msix->pdev->vpci_lock); > +} Maybe shorter if ( msix ) spin_unlock(&msix->pdev->vpci_lock); ? Yet there's only one case where you may pass NULL in here, so maybe it's better anyway to move the conditional ... > static int msix_accept(struct vcpu *v, unsigned long addr) > { > - return !!msix_find(v->domain, addr); > + struct vpci_msix *msix = msix_get(v->domain, addr); > + > + msix_put(msix); > + return !!msix; > } ... here? 
> @@ -186,7 +200,7 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, > unsigned long *data) > { > const struct domain *d = v->domain; > - struct vpci_msix *msix = msix_find(d, addr); > + struct vpci_msix *msix = msix_get(d, addr); > const struct vpci_msix_entry *entry; > unsigned int offset; > > @@ -196,7 +210,10 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, > return X86EMUL_RETRY; > > if ( !access_allowed(msix->pdev, addr, len) ) > + { > + msix_put(msix); > return X86EMUL_OKAY; > + } > > if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) ) > { > @@ -222,10 +239,10 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, > break; > } > > + msix_put(msix); > return X86EMUL_OKAY; > } > > - spin_lock(&msix->pdev->vpci->lock); > entry = get_entry(msix, addr); > offset = addr & (PCI_MSIX_ENTRY_SIZE - 1); You're increasing the locked region quite a bit here. If this is really needed, it wants explaining. And if this is deemed acceptable as a "side effect", it wants justifying or at least stating imo. Same for msix_write() then, obviously. (I'm not sure Roger actually implied this when suggesting to switch to the get/put pair.) > @@ -327,7 +334,12 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size) > if ( !pdev ) > return vpci_read_hw(sbdf, reg, size); > > - spin_lock(&pdev->vpci->lock); > + spin_lock(&pdev->vpci_lock); > + if ( !pdev->vpci ) > + { > + spin_unlock(&pdev->vpci_lock); > + return vpci_read_hw(sbdf, reg, size); > + } Didn't you say you would add justification of this part of the change (and its vpci_write() counterpart) to the description? Jan
Hi, Jan! On 04.02.22 09:52, Jan Beulich wrote: > On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: > > At the first glance this simply looks like another unjustified (in the > description) change, as you're not converting anything here but you > actually add locking (and I realize this was there before, so I'm sorry > for not pointing this out earlier). But then I wonder whether you > actually tested this It is already stated in the cover letter that I have tested two x86 configurations and also tested this on Arm. Would you like to see the relevant logs? Thank you, Oleksandr
On 04.02.2022 09:13, Oleksandr Andrushchenko wrote: > On 04.02.22 09:52, Jan Beulich wrote: >> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: >> >> At the first glance this simply looks like another unjustified (in the >> description) change, as you're not converting anything here but you >> actually add locking (and I realize this was there before, so I'm sorry >> for not pointing this out earlier). But then I wonder whether you >> actually tested this > This is already stated in the cover letter that I have tested two x86 > configurations and tested that on Arm....... Okay, I'm sorry then. But could you then please point out where I'm wrong with my analysis? Jan
Hi, Jan! On 04.02.22 09:52, Jan Beulich wrote: > On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: >> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >> continue; >> } >> >> + spin_lock(&tmp->vpci_lock); >> + if ( !tmp->vpci ) >> + { >> + spin_unlock(&tmp->vpci_lock); >> + continue; >> + } >> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) >> { >> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; >> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >> rc = rangeset_remove_range(mem, start, end); >> if ( rc ) >> { >> + spin_unlock(&tmp->vpci_lock); >> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", >> start, end, rc); >> rangeset_destroy(mem); >> return rc; >> } >> } >> + spin_unlock(&tmp->vpci_lock); >> } > At the first glance this simply looks like another unjustified (in the > description) change, as you're not converting anything here but you > actually add locking (and I realize this was there before, so I'm sorry > for not pointing this out earlier). Well, I thought that the description already has "...the lock can be used (and in a few cases is used right away) to check whether vpci is present" and this is enough for such uses as here. > But then I wonder whether you > actually tested this, since I can't help getting the impression that > you're introducing a live-lock: The function is called from cmd_write() > and rom_write(), which in turn are called out of vpci_write(). Yet that > function already holds the lock, and the lock is not (currently) > recursive. (For the 3rd caller of the function - init_bars() - otoh > the locking looks to be entirely unnecessary.) Well, you are correct: if tmp != pdev then it is correct to acquire the lock. But if tmp == pdev and rom_only == true then we'll deadlock. It seems we need to have the locking conditional, e.g. 
only lock if tmp != pdev > > Then again this was present already even in Roger's original patch, so > I guess I must be missing something ... > >> --- a/xen/drivers/vpci/msix.c >> +++ b/xen/drivers/vpci/msix.c >> @@ -138,7 +138,7 @@ static void control_write(const struct pci_dev *pdev, unsigned int reg, >> pci_conf_write16(pdev->sbdf, reg, val); >> } >> >> -static struct vpci_msix *msix_find(const struct domain *d, unsigned long addr) >> +static struct vpci_msix *msix_get(const struct domain *d, unsigned long addr) >> { >> struct vpci_msix *msix; >> >> @@ -150,15 +150,29 @@ static struct vpci_msix *msix_find(const struct domain *d, unsigned long addr) >> for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ ) >> if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled && >> VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) ) >> + { >> + spin_lock(&msix->pdev->vpci_lock); >> return msix; >> + } > I think deliberately returning with a lock held requires a respective > comment ahead of the function. Ok, will add a comment > >> } >> >> return NULL; >> } >> >> +static void msix_put(struct vpci_msix *msix) >> +{ >> + if ( !msix ) >> + return; >> + >> + spin_unlock(&msix->pdev->vpci_lock); >> +} > Maybe shorter > > if ( msix ) > spin_unlock(&msix->pdev->vpci_lock); Looks good > > ? Yet there's only one case where you may pass NULL in here, so > maybe it's better anyway to move the conditional ... > >> static int msix_accept(struct vcpu *v, unsigned long addr) >> { >> - return !!msix_find(v->domain, addr); >> + struct vpci_msix *msix = msix_get(v->domain, addr); >> + >> + msix_put(msix); >> + return !!msix; >> } > ... here? Yes, I can have that check here, but what if there is yet another caller of the same? I am not sure whether it is better to have the check in msix_get or at the caller site. At the moment (with a single place with NULL possible) I can move the check. @Roger? 
> >> @@ -186,7 +200,7 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, >> unsigned long *data) >> { >> const struct domain *d = v->domain; >> - struct vpci_msix *msix = msix_find(d, addr); >> + struct vpci_msix *msix = msix_get(d, addr); >> const struct vpci_msix_entry *entry; >> unsigned int offset; >> >> @@ -196,7 +210,10 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, >> return X86EMUL_RETRY; >> >> if ( !access_allowed(msix->pdev, addr, len) ) >> + { >> + msix_put(msix); >> return X86EMUL_OKAY; >> + } >> >> if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) ) >> { >> @@ -222,10 +239,10 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, >> break; >> } >> >> + msix_put(msix); >> return X86EMUL_OKAY; >> } >> >> - spin_lock(&msix->pdev->vpci->lock); >> entry = get_entry(msix, addr); >> offset = addr & (PCI_MSIX_ENTRY_SIZE - 1); > You're increasing the locked region quite a bit here. If this is really > needed, it wants explaining. And if this is deemed acceptable as a > "side effect", it wants justifying or at least stating imo. Same for > msix_write() then, obviously. Yes, I do increase the locking region here, but the msix variable needs to be protected all the time, so it seems to be obvious that it remains under the lock > (I'm not sure Roger actually implied this > when suggesting to switch to the get/put pair.) > >> @@ -327,7 +334,12 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size) >> if ( !pdev ) >> return vpci_read_hw(sbdf, reg, size); >> >> - spin_lock(&pdev->vpci->lock); >> + spin_lock(&pdev->vpci_lock); >> + if ( !pdev->vpci ) >> + { >> + spin_unlock(&pdev->vpci_lock); >> + return vpci_read_hw(sbdf, reg, size); >> + } > Didn't you say you would add justification of this part of the change > (and its vpci_write() counterpart) to the description? 
Again, I am referring to the commit message as described above > > Jan > Thank you, Oleksandr
On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: > On 04.02.22 09:52, Jan Beulich wrote: >> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: >>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>> continue; >>> } >>> >>> + spin_lock(&tmp->vpci_lock); >>> + if ( !tmp->vpci ) >>> + { >>> + spin_unlock(&tmp->vpci_lock); >>> + continue; >>> + } >>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) >>> { >>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; >>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>> rc = rangeset_remove_range(mem, start, end); >>> if ( rc ) >>> { >>> + spin_unlock(&tmp->vpci_lock); >>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", >>> start, end, rc); >>> rangeset_destroy(mem); >>> return rc; >>> } >>> } >>> + spin_unlock(&tmp->vpci_lock); >>> } >> At the first glance this simply looks like another unjustified (in the >> description) change, as you're not converting anything here but you >> actually add locking (and I realize this was there before, so I'm sorry >> for not pointing this out earlier). > Well, I thought that the description already has "...the lock can be > used (and in a few cases is used right away) to check whether vpci > is present" and this is enough for such uses as here. >> But then I wonder whether you >> actually tested this, since I can't help getting the impression that >> you're introducing a live-lock: The function is called from cmd_write() >> and rom_write(), which in turn are called out of vpci_write(). Yet that >> function already holds the lock, and the lock is not (currently) >> recursive. (For the 3rd caller of the function - init_bars() - otoh >> the locking looks to be entirely unnecessary.) > Well, you are correct: if tmp != pdev then it is correct to acquire > the lock. But if tmp == pdev and rom_only == true > then we'll deadlock. 
> > It seems we need to have the locking conditional, e.g. only lock > if tmp != pdev Which will address the live-lock, but introduce ABBA deadlock potential between the two locks. >>> @@ -222,10 +239,10 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, >>> break; >>> } >>> >>> + msix_put(msix); >>> return X86EMUL_OKAY; >>> } >>> >>> - spin_lock(&msix->pdev->vpci->lock); >>> entry = get_entry(msix, addr); >>> offset = addr & (PCI_MSIX_ENTRY_SIZE - 1); >> You're increasing the locked region quite a bit here. If this is really >> needed, it wants explaining. And if this is deemed acceptable as a >> "side effect", it wants justifying or at least stating imo. Same for >> msix_write() then, obviously. > Yes, I do increase the locking region here, but the msix variable needs > to be protected all the time, so it seems to be obvious that it remains > under the lock What does the msix variable have to do with the vPCI lock? If you see a need to grow the locked region here, then surely this is independent of your conversion of the lock, and hence wants to be a prereq fix (which may in fact want/need backporting). >>> @@ -327,7 +334,12 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size) >>> if ( !pdev ) >>> return vpci_read_hw(sbdf, reg, size); >>> >>> - spin_lock(&pdev->vpci->lock); >>> + spin_lock(&pdev->vpci_lock); >>> + if ( !pdev->vpci ) >>> + { >>> + spin_unlock(&pdev->vpci_lock); >>> + return vpci_read_hw(sbdf, reg, size); >>> + } >> Didn't you say you would add justification of this part of the change >> (and its vpci_write() counterpart) to the description? > Again, I am referring to the commit message as described above No, sorry - that part applies only to what inside the parentheses of if(). But on the intermediate version (post-v5 in a 4-patch series) I did say: "In this case as well as in its write counterpart it becomes even more important to justify (in the description) the new behavior. 
It is not obvious at all that the absence of a struct vpci should be taken as an indication that the underlying device needs accessing instead. This also cannot be inferred from the "!pdev" case visible in context. In that case we have no record of a device at this SBDF, and hence the fallback pretty clearly is a "just in case" one. Yet if we know of a device, the absence of a struct vpci may mean various possible things." If it wasn't obvious: The comment was on the use of vpci_read_hw() on this path, not redundant with the earlier one regarding the added "is vpci non-NULL" in a few places. Jan
Hi, Jan! On 04.02.22 11:15, Jan Beulich wrote: > On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: >> On 04.02.22 09:52, Jan Beulich wrote: >>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: >>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>> continue; >>>> } >>>> >>>> + spin_lock(&tmp->vpci_lock); >>>> + if ( !tmp->vpci ) >>>> + { >>>> + spin_unlock(&tmp->vpci_lock); >>>> + continue; >>>> + } >>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) >>>> { >>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; >>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>> rc = rangeset_remove_range(mem, start, end); >>>> if ( rc ) >>>> { >>>> + spin_unlock(&tmp->vpci_lock); >>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", >>>> start, end, rc); >>>> rangeset_destroy(mem); >>>> return rc; >>>> } >>>> } >>>> + spin_unlock(&tmp->vpci_lock); >>>> } >>> At the first glance this simply looks like another unjustified (in the >>> description) change, as you're not converting anything here but you >>> actually add locking (and I realize this was there before, so I'm sorry >>> for not pointing this out earlier). >> Well, I thought that the description already has "...the lock can be >> used (and in a few cases is used right away) to check whether vpci >> is present" and this is enough for such uses as here. >>> But then I wonder whether you >>> actually tested this, since I can't help getting the impression that >>> you're introducing a live-lock: The function is called from cmd_write() >>> and rom_write(), which in turn are called out of vpci_write(). Yet that >>> function already holds the lock, and the lock is not (currently) >>> recursive. (For the 3rd caller of the function - init_bars() - otoh >>> the locking looks to be entirely unnecessary.) >> Well, you are correct: if tmp != pdev then it is correct to acquire >> the lock. 
But if tmp == pdev and rom_only == true >> then we'll deadlock. >> >> It seems we need to have the locking conditional, e.g. only lock >> if tmp != pdev > Which will address the live-lock, but introduce ABBA deadlock potential > between the two locks. I am not sure I can suggest a better solution here @Roger, @Jan, could you please help here? > >>>> @@ -222,10 +239,10 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, >>>> break; >>>> } >>>> >>>> + msix_put(msix); >>>> return X86EMUL_OKAY; >>>> } >>>> >>>> - spin_lock(&msix->pdev->vpci->lock); >>>> entry = get_entry(msix, addr); >>>> offset = addr & (PCI_MSIX_ENTRY_SIZE - 1); >>> You're increasing the locked region quite a bit here. If this is really >>> needed, it wants explaining. And if this is deemed acceptable as a >>> "side effect", it wants justifying or at least stating imo. Same for >>> msix_write() then, obviously. >> Yes, I do increase the locking region here, but the msix variable needs >> to be protected all the time, so it seems to be obvious that it remains >> under the lock > What does the msix variable have to do with the vPCI lock? If you see > a need to grow the locked region here, then surely this is independent > of your conversion of the lock, and hence wants to be a prereq fix > (which may in fact want/need backporting). First of all, the implementation of msix_get is wrong and needs to be: /* * Note: if vpci_msix found, then this function returns with * pdev->vpci_lock held. Use msix_put to unlock. 
*/ static struct vpci_msix *msix_get(const struct domain *d, unsigned long addr) { struct vpci_msix *msix; list_for_each_entry ( msix, &d->arch.hvm.msix_tables, next ) { const struct vpci_bar *bars; unsigned int i; spin_lock(&msix->pdev->vpci_lock); if ( !msix->pdev->vpci ) { spin_unlock(&msix->pdev->vpci_lock); continue; } bars = msix->pdev->vpci->header.bars; for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ ) if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled && VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) ) return msix; spin_unlock(&msix->pdev->vpci_lock); } return NULL; } Then, both msix_{read|write} can dereference msix->pdev->vpci early, this is why Roger suggested we move to msix_{get|put} here. And yes, we grow the locked region here and yes this might want a prereq fix. Or just be fixed while at it. > >>>> @@ -327,7 +334,12 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size) >>>> if ( !pdev ) >>>> return vpci_read_hw(sbdf, reg, size); >>>> >>>> - spin_lock(&pdev->vpci->lock); >>>> + spin_lock(&pdev->vpci_lock); >>>> + if ( !pdev->vpci ) >>>> + { >>>> + spin_unlock(&pdev->vpci_lock); >>>> + return vpci_read_hw(sbdf, reg, size); >>>> + } >>> Didn't you say you would add justification of this part of the change >>> (and its vpci_write() counterpart) to the description? >> Again, I am referring to the commit message as described above > No, sorry - that part applies only to what inside the parentheses of > if(). But on the intermediate version (post-v5 in a 4-patch series) I > did say: > > "In this case as well as in its write counterpart it becomes even more > important to justify (in the description) the new behavior. It is not > obvious at all that the absence of a struct vpci should be taken as > an indication that the underlying device needs accessing instead. > This also cannot be inferred from the "!pdev" case visible in context. 
> In that case we have no record of a device at this SBDF, and hence the > fallback pretty clearly is a "just in case" one. Yet if we know of a > device, the absence of a struct vpci may mean various possible things." > > If it wasn't obvious: The comment was on the use of vpci_read_hw() on > this path, not redundant with the earlier one regarding the added > "is vpci non-NULL" in a few places. Ok > > Jan > Thank you, Oleksandr
On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: > On 04.02.22 11:15, Jan Beulich wrote: >> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: >>> On 04.02.22 09:52, Jan Beulich wrote: >>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: >>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>> continue; >>>>> } >>>>> >>>>> + spin_lock(&tmp->vpci_lock); >>>>> + if ( !tmp->vpci ) >>>>> + { >>>>> + spin_unlock(&tmp->vpci_lock); >>>>> + continue; >>>>> + } >>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) >>>>> { >>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; >>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>> rc = rangeset_remove_range(mem, start, end); >>>>> if ( rc ) >>>>> { >>>>> + spin_unlock(&tmp->vpci_lock); >>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", >>>>> start, end, rc); >>>>> rangeset_destroy(mem); >>>>> return rc; >>>>> } >>>>> } >>>>> + spin_unlock(&tmp->vpci_lock); >>>>> } >>>> At the first glance this simply looks like another unjustified (in the >>>> description) change, as you're not converting anything here but you >>>> actually add locking (and I realize this was there before, so I'm sorry >>>> for not pointing this out earlier). >>> Well, I thought that the description already has "...the lock can be >>> used (and in a few cases is used right away) to check whether vpci >>> is present" and this is enough for such uses as here. >>>> But then I wonder whether you >>>> actually tested this, since I can't help getting the impression that >>>> you're introducing a live-lock: The function is called from cmd_write() >>>> and rom_write(), which in turn are called out of vpci_write(). Yet that >>>> function already holds the lock, and the lock is not (currently) >>>> recursive. 
(For the 3rd caller of the function - init_bars() - otoh >>>> the locking looks to be entirely unnecessary.) >>> Well, you are correct: if tmp != pdev then it is correct to acquire >>> the lock. But if tmp == pdev and rom_only == true >>> then we'll deadlock. >>> >>> It seems we need to have the locking conditional, e.g. only lock >>> if tmp != pdev >> Which will address the live-lock, but introduce ABBA deadlock potential >> between the two locks. > I am not sure I can suggest a better solution here > @Roger, @Jan, could you please help here? Well, first of all I'd like to mention that while it may have been okay to not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing with DomU-s' lists of PCI devices. The requirement really applies to the other use of for_each_pdev() as well (in vpci_dump_msi()), except that there it probably wants to be a try-lock. Next I'd like to point out that here we have the still pending issue of how to deal with hidden devices, which Dom0 can access. See my RFC patch "vPCI: account for hidden devices in modify_bars()". Whatever the solution here, I think it wants to at least account for the extra need there. Now it is quite clear that pcidevs_lock isn't going to help with avoiding the deadlock, as it's imo not an option at all to acquire that lock everywhere else you access ->vpci (or else the vpci lock itself would be pointless). But a per-domain auxiliary r/w lock may help: Other paths would acquire it in read mode, and here you'd acquire it in write mode (in the former case around the vpci lock, while in the latter case there may then not be any need to acquire the individual vpci locks at all). FTAOD: I haven't fully thought through all implications (and hence whether this is viable in the first place); I expect you will, documenting what you've found in the resulting patch description. Of course the double lock acquire/release would then likely want hiding in helper functions. Jan
On Fri, Feb 04, 2022 at 10:12:46AM +0000, Oleksandr Andrushchenko wrote: > Hi, Jan! > > On 04.02.22 11:15, Jan Beulich wrote: > > On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: > >> On 04.02.22 09:52, Jan Beulich wrote: > >>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: > >>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) > >>>> continue; > >>>> } > >>>> > >>>> + spin_lock(&tmp->vpci_lock); > >>>> + if ( !tmp->vpci ) > >>>> + { > >>>> + spin_unlock(&tmp->vpci_lock); > >>>> + continue; > >>>> + } > >>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) > >>>> { > >>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; > >>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) > >>>> rc = rangeset_remove_range(mem, start, end); > >>>> if ( rc ) > >>>> { > >>>> + spin_unlock(&tmp->vpci_lock); > >>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", > >>>> start, end, rc); > >>>> rangeset_destroy(mem); > >>>> return rc; > >>>> } > >>>> } > >>>> + spin_unlock(&tmp->vpci_lock); > >>>> } > >>> At the first glance this simply looks like another unjustified (in the > >>> description) change, as you're not converting anything here but you > >>> actually add locking (and I realize this was there before, so I'm sorry > >>> for not pointing this out earlier). > >> Well, I thought that the description already has "...the lock can be > >> used (and in a few cases is used right away) to check whether vpci > >> is present" and this is enough for such uses as here. > >>> But then I wonder whether you > >>> actually tested this, since I can't help getting the impression that > >>> you're introducing a live-lock: The function is called from cmd_write() > >>> and rom_write(), which in turn are called out of vpci_write(). Yet that > >>> function already holds the lock, and the lock is not (currently) > >>> recursive. 
(For the 3rd caller of the function - init_bars() - otoh > >>> the locking looks to be entirely unnecessary.) > >> Well, you are correct: if tmp != pdev then it is correct to acquire > >> the lock. But if tmp == pdev and rom_only == true > >> then we'll deadlock. > >> > >> It seems we need to have the locking conditional, e.g. only lock > >> if tmp != pdev > > Which will address the live-lock, but introduce ABBA deadlock potential > > between the two locks. > I am not sure I can suggest a better solution here > @Roger, @Jan, could you please help here? I think we could set the locking order based on the memory address of the locks, ie: if ( &tmp->vpci_lock < &pdev->vpci_lock ) { spin_unlock(&pdev->vpci_lock); spin_lock(&tmp->vpci_lock); spin_lock(&pdev->vpci_lock); if ( !pdev->vpci || &pdev->vpci->header != header ) /* ERROR: vpci removed or recreated. */ } else spin_lock(&tmp->vpci_lock); That however creates a window where the address of the BARs on the current device (pdev) could be changed, so the result of the mapping might be skewed. I think the guest would only have itself to blame for that, as changing the position of the BARs while toggling memory decoding is not something sensible to do. > > > >>>> @@ -222,10 +239,10 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, > >>>> break; > >>>> } > >>>> > >>>> + msix_put(msix); > >>>> return X86EMUL_OKAY; > >>>> } > >>>> > >>>> - spin_lock(&msix->pdev->vpci->lock); > >>>> entry = get_entry(msix, addr); > >>>> offset = addr & (PCI_MSIX_ENTRY_SIZE - 1); > >>> You're increasing the locked region quite a bit here. If this is really > >>> needed, it wants explaining. And if this is deemed acceptable as a > >>> "side effect", it wants justifying or at least stating imo. Same for > >>> msix_write() then, obviously. 
> >> Yes, I do increase the locking region here, but the msix variable needs > >> to be protected all the time, so it seems to be obvious that it remains > >> under the lock > > What does the msix variable have to do with the vPCI lock? If you see > > a need to grow the locked region here, then surely this is independent > > of your conversion of the lock, and hence wants to be a prereq fix > > (which may in fact want/need backporting). > First of all, the implementation of msix_get is wrong and needs to be: > > /* > * Note: if vpci_msix found, then this function returns with > * pdev->vpci_lock held. Use msix_put to unlock. > */ > static struct vpci_msix *msix_get(const struct domain *d, unsigned long addr) > { > struct vpci_msix *msix; > > list_for_each_entry ( msix, &d->arch.hvm.msix_tables, next ) Strictly speaking you would also need to introduce a lock here to protect msix_tables. This was all designed when hot-adding (or removing) PCI devices to the domain wasn't supported. > { > const struct vpci_bar *bars; > unsigned int i; > > spin_lock(&msix->pdev->vpci_lock); > if ( !msix->pdev->vpci ) > { > spin_unlock(&msix->pdev->vpci_lock); > continue; > } > > bars = msix->pdev->vpci->header.bars; > for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ ) > if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled && > VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) ) > return msix; > > spin_unlock(&msix->pdev->vpci_lock); > } > > return NULL; > } > > Then, both msix_{read|write} can dereference msix->pdev->vpci early, > this is why Roger suggested we move to msix_{get|put} here. > And yes, we grow the locked region here and yes this might want a > prereq fix. Or just be fixed while at it. Ideally yes, we would need a separate fix that introduced msix_{get,put}, because the currently unlocked regions of msix_{read,write} do access the BAR address fields, and doing so without holding the vpci lock would be racy. 
I would expect that the writing/reading of the addr field is done in a single instruction, so it's unlikely to be a problem in practice. That's kind of similar to the fact that modify_bars also accesses the addr and size fields of remote BARs without taking the respective lock. Once the lock is moved outside of the vpci struct and it's used to assert that pdev->vpci is present then we do need to hold it while accessing vpci, or else the struct could be removed under our feet. Roger.
On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote: > On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: > > On 04.02.22 11:15, Jan Beulich wrote: > >> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: > >>> On 04.02.22 09:52, Jan Beulich wrote: > >>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: > >>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) > >>>>> continue; > >>>>> } > >>>>> > >>>>> + spin_lock(&tmp->vpci_lock); > >>>>> + if ( !tmp->vpci ) > >>>>> + { > >>>>> + spin_unlock(&tmp->vpci_lock); > >>>>> + continue; > >>>>> + } > >>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) > >>>>> { > >>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; > >>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) > >>>>> rc = rangeset_remove_range(mem, start, end); > >>>>> if ( rc ) > >>>>> { > >>>>> + spin_unlock(&tmp->vpci_lock); > >>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", > >>>>> start, end, rc); > >>>>> rangeset_destroy(mem); > >>>>> return rc; > >>>>> } > >>>>> } > >>>>> + spin_unlock(&tmp->vpci_lock); > >>>>> } > >>>> At the first glance this simply looks like another unjustified (in the > >>>> description) change, as you're not converting anything here but you > >>>> actually add locking (and I realize this was there before, so I'm sorry > >>>> for not pointing this out earlier). > >>> Well, I thought that the description already has "...the lock can be > >>> used (and in a few cases is used right away) to check whether vpci > >>> is present" and this is enough for such uses as here. > >>>> But then I wonder whether you > >>>> actually tested this, since I can't help getting the impression that > >>>> you're introducing a live-lock: The function is called from cmd_write() > >>>> and rom_write(), which in turn are called out of vpci_write(). 
Yet that > >>>> function already holds the lock, and the lock is not (currently) > >>>> recursive. (For the 3rd caller of the function - init_bars() - otoh > >>>> the locking looks to be entirely unnecessary.) > >>> Well, you are correct: if tmp != pdev then it is correct to acquire > >>> the lock. But if tmp == pdev and rom_only == true > >>> then we'll deadlock. > >>> > >>> It seems we need to have the locking conditional, e.g. only lock > >>> if tmp != pdev > >> Which will address the live-lock, but introduce ABBA deadlock potential > >> between the two locks. > > I am not sure I can suggest a better solution here > > @Roger, @Jan, could you please help here? > > Well, first of all I'd like to mention that while it may have been okay to > not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing > with DomU-s' lists of PCI devices. The requirement really applies to the > other use of for_each_pdev() as well (in vpci_dump_msi()), except that > there it probably wants to be a try-lock. > > Next I'd like to point out that here we have the still pending issue of > how to deal with hidden devices, which Dom0 can access. See my RFC patch > "vPCI: account for hidden devices in modify_bars()". Whatever the solution > here, I think it wants to at least account for the extra need there. Yes, sorry, I should take care of that. > Now it is quite clear that pcidevs_lock isn't going to help with avoiding > the deadlock, as it's imo not an option at all to acquire that lock > everywhere else you access ->vpci (or else the vpci lock itself would be > pointless). But a per-domain auxiliary r/w lock may help: Other paths > would acquire it in read mode, and here you'd acquire it in write mode (in > the former case around the vpci lock, while in the latter case there may > then not be any need to acquire the individual vpci locks at all). 
FTAOD: > I haven't fully thought through all implications (and hence whether this is > viable in the first place); I expect you will, documenting what you've > found in the resulting patch description. Of course the double lock > acquire/release would then likely want hiding in helper functions. I've been also thinking about this, and whether it's really worth to have a per-device lock rather than a per-domain one that protects all vpci regions of the devices assigned to the domain. The OS is likely to serialize accesses to the PCI config space anyway, and the only place I could see a benefit of having per-device locks is in the handling of MSI-X tables, as the handling of the mask bit is likely very performance sensitive, so adding a per-domain lock there could be a bottleneck. We could alternatively do a per-domain rwlock for vpci and special case the MSI-X area to also have a per-device specific lock. At which point it becomes fairly similar to what you propose. Thanks, Roger.
On 04.02.2022 12:13, Roger Pau Monné wrote: > On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote: >> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: >>> On 04.02.22 11:15, Jan Beulich wrote: >>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: >>>>> On 04.02.22 09:52, Jan Beulich wrote: >>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: >>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>> continue; >>>>>>> } >>>>>>> >>>>>>> + spin_lock(&tmp->vpci_lock); >>>>>>> + if ( !tmp->vpci ) >>>>>>> + { >>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>> + continue; >>>>>>> + } >>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) >>>>>>> { >>>>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; >>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>> rc = rangeset_remove_range(mem, start, end); >>>>>>> if ( rc ) >>>>>>> { >>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", >>>>>>> start, end, rc); >>>>>>> rangeset_destroy(mem); >>>>>>> return rc; >>>>>>> } >>>>>>> } >>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>> } >>>>>> At the first glance this simply looks like another unjustified (in the >>>>>> description) change, as you're not converting anything here but you >>>>>> actually add locking (and I realize this was there before, so I'm sorry >>>>>> for not pointing this out earlier). >>>>> Well, I thought that the description already has "...the lock can be >>>>> used (and in a few cases is used right away) to check whether vpci >>>>> is present" and this is enough for such uses as here. >>>>>> But then I wonder whether you >>>>>> actually tested this, since I can't help getting the impression that >>>>>> you're introducing a live-lock: The function is called from cmd_write() >>>>>> and rom_write(), which in turn are called out of vpci_write(). 
Yet that >>>>>> function already holds the lock, and the lock is not (currently) >>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh >>>>>> the locking looks to be entirely unnecessary.) >>>>> Well, you are correct: if tmp != pdev then it is correct to acquire >>>>> the lock. But if tmp == pdev and rom_only == true >>>>> then we'll deadlock. >>>>> >>>>> It seems we need to have the locking conditional, e.g. only lock >>>>> if tmp != pdev >>>> Which will address the live-lock, but introduce ABBA deadlock potential >>>> between the two locks. >>> I am not sure I can suggest a better solution here >>> @Roger, @Jan, could you please help here? >> >> Well, first of all I'd like to mention that while it may have been okay to >> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing >> with DomU-s' lists of PCI devices. The requirement really applies to the >> other use of for_each_pdev() as well (in vpci_dump_msi()), except that >> there it probably wants to be a try-lock. >> >> Next I'd like to point out that here we have the still pending issue of >> how to deal with hidden devices, which Dom0 can access. See my RFC patch >> "vPCI: account for hidden devices in modify_bars()". Whatever the solution >> here, I think it wants to at least account for the extra need there. > > Yes, sorry, I should take care of that. > >> Now it is quite clear that pcidevs_lock isn't going to help with avoiding >> the deadlock, as it's imo not an option at all to acquire that lock >> everywhere else you access ->vpci (or else the vpci lock itself would be >> pointless). But a per-domain auxiliary r/w lock may help: Other paths >> would acquire it in read mode, and here you'd acquire it in write mode (in >> the former case around the vpci lock, while in the latter case there may >> then not be any need to acquire the individual vpci locks at all). 
FTAOD: >> I haven't fully thought through all implications (and hence whether this is >> viable in the first place); I expect you will, documenting what you've >> found in the resulting patch description. Of course the double lock >> acquire/release would then likely want hiding in helper functions. > > I've been also thinking about this, and whether it's really worth to > have a per-device lock rather than a per-domain one that protects all > vpci regions of the devices assigned to the domain. > > The OS is likely to serialize accesses to the PCI config space anyway, > and the only place I could see a benefit of having per-device locks is > in the handling of MSI-X tables, as the handling of the mask bit is > likely very performance sensitive, so adding a per-domain lock there > could be a bottleneck. Hmm, with method 1 accesses serializing globally is basically unavoidable, but with MMCFG I see no reason why OSes may not (move to) permit(ting) parallel accesses, with serialization perhaps done only at device level. See our own pci_config_lock, which applies to only method 1 accesses; we don't look to be serializing MMCFG accesses at all. > We could alternatively do a per-domain rwlock for vpci and special case > the MSI-X area to also have a per-device specific lock. At which point > it becomes fairly similar to what you propose. Indeed. Jan
On 04.02.22 13:13, Roger Pau Monné wrote: > On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote: >> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: >>> On 04.02.22 11:15, Jan Beulich wrote: >>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: >>>>> On 04.02.22 09:52, Jan Beulich wrote: >>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: >>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>> continue; >>>>>>> } >>>>>>> >>>>>>> + spin_lock(&tmp->vpci_lock); >>>>>>> + if ( !tmp->vpci ) >>>>>>> + { >>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>> + continue; >>>>>>> + } >>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) >>>>>>> { >>>>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; >>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>> rc = rangeset_remove_range(mem, start, end); >>>>>>> if ( rc ) >>>>>>> { >>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", >>>>>>> start, end, rc); >>>>>>> rangeset_destroy(mem); >>>>>>> return rc; >>>>>>> } >>>>>>> } >>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>> } >>>>>> At the first glance this simply looks like another unjustified (in the >>>>>> description) change, as you're not converting anything here but you >>>>>> actually add locking (and I realize this was there before, so I'm sorry >>>>>> for not pointing this out earlier). >>>>> Well, I thought that the description already has "...the lock can be >>>>> used (and in a few cases is used right away) to check whether vpci >>>>> is present" and this is enough for such uses as here. >>>>>> But then I wonder whether you >>>>>> actually tested this, since I can't help getting the impression that >>>>>> you're introducing a live-lock: The function is called from cmd_write() >>>>>> and rom_write(), which in turn are called out of vpci_write(). 
Yet that >>>>>> function already holds the lock, and the lock is not (currently) >>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh >>>>>> the locking looks to be entirely unnecessary.) >>>>> Well, you are correct: if tmp != pdev then it is correct to acquire >>>>> the lock. But if tmp == pdev and rom_only == true >>>>> then we'll deadlock. >>>>> >>>>> It seems we need to have the locking conditional, e.g. only lock >>>>> if tmp != pdev >>>> Which will address the live-lock, but introduce ABBA deadlock potential >>>> between the two locks. >>> I am not sure I can suggest a better solution here >>> @Roger, @Jan, could you please help here? >> Well, first of all I'd like to mention that while it may have been okay to >> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing >> with DomU-s' lists of PCI devices. The requirement really applies to the >> other use of for_each_pdev() as well (in vpci_dump_msi()), except that >> there it probably wants to be a try-lock. >> >> Next I'd like to point out that here we have the still pending issue of >> how to deal with hidden devices, which Dom0 can access. See my RFC patch >> "vPCI: account for hidden devices in modify_bars()". Whatever the solution >> here, I think it wants to at least account for the extra need there. > Yes, sorry, I should take care of that. > >> Now it is quite clear that pcidevs_lock isn't going to help with avoiding >> the deadlock, as it's imo not an option at all to acquire that lock >> everywhere else you access ->vpci (or else the vpci lock itself would be >> pointless). But a per-domain auxiliary r/w lock may help: Other paths >> would acquire it in read mode, and here you'd acquire it in write mode (in >> the former case around the vpci lock, while in the latter case there may >> then not be any need to acquire the individual vpci locks at all). 
FTAOD: >> I haven't fully thought through all implications (and hence whether this is >> viable in the first place); I expect you will, documenting what you've >> found in the resulting patch description. Of course the double lock >> acquire/release would then likely want hiding in helper functions. > I've been also thinking about this, and whether it's really worth to > have a per-device lock rather than a per-domain one that protects all > vpci regions of the devices assigned to the domain. > > The OS is likely to serialize accesses to the PCI config space anyway, > and the only place I could see a benefit of having per-device locks is > in the handling of MSI-X tables, as the handling of the mask bit is > likely very performance sensitive, so adding a per-domain lock there > could be a bottleneck. > > We could alternatively do a per-domain rwlock for vpci and special case > the MSI-X area to also have a per-device specific lock. At which point > it becomes fairly similar to what you propose. I need a decision. Please. > > Thanks, Roger. Thank you, Oleksandr
On Fri, Feb 04, 2022 at 11:37:50AM +0000, Oleksandr Andrushchenko wrote: > > > On 04.02.22 13:13, Roger Pau Monné wrote: > > On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote: > >> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: > >>> On 04.02.22 11:15, Jan Beulich wrote: > >>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: > >>>>> On 04.02.22 09:52, Jan Beulich wrote: > >>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: > >>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) > >>>>>>> continue; > >>>>>>> } > >>>>>>> > >>>>>>> + spin_lock(&tmp->vpci_lock); > >>>>>>> + if ( !tmp->vpci ) > >>>>>>> + { > >>>>>>> + spin_unlock(&tmp->vpci_lock); > >>>>>>> + continue; > >>>>>>> + } > >>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) > >>>>>>> { > >>>>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; > >>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) > >>>>>>> rc = rangeset_remove_range(mem, start, end); > >>>>>>> if ( rc ) > >>>>>>> { > >>>>>>> + spin_unlock(&tmp->vpci_lock); > >>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", > >>>>>>> start, end, rc); > >>>>>>> rangeset_destroy(mem); > >>>>>>> return rc; > >>>>>>> } > >>>>>>> } > >>>>>>> + spin_unlock(&tmp->vpci_lock); > >>>>>>> } > >>>>>> At the first glance this simply looks like another unjustified (in the > >>>>>> description) change, as you're not converting anything here but you > >>>>>> actually add locking (and I realize this was there before, so I'm sorry > >>>>>> for not pointing this out earlier). > >>>>> Well, I thought that the description already has "...the lock can be > >>>>> used (and in a few cases is used right away) to check whether vpci > >>>>> is present" and this is enough for such uses as here. 
> >>>>>> But then I wonder whether you > >>>>>> actually tested this, since I can't help getting the impression that > >>>>>> you're introducing a live-lock: The function is called from cmd_write() > >>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet that > >>>>>> function already holds the lock, and the lock is not (currently) > >>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh > >>>>>> the locking looks to be entirely unnecessary.) > >>>>> Well, you are correct: if tmp != pdev then it is correct to acquire > >>>>> the lock. But if tmp == pdev and rom_only == true > >>>>> then we'll deadlock. > >>>>> > >>>>> It seems we need to have the locking conditional, e.g. only lock > >>>>> if tmp != pdev > >>>> Which will address the live-lock, but introduce ABBA deadlock potential > >>>> between the two locks. > >>> I am not sure I can suggest a better solution here > >>> @Roger, @Jan, could you please help here? > >> Well, first of all I'd like to mention that while it may have been okay to > >> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing > >> with DomU-s' lists of PCI devices. The requirement really applies to the > >> other use of for_each_pdev() as well (in vpci_dump_msi()), except that > >> there it probably wants to be a try-lock. > >> > >> Next I'd like to point out that here we have the still pending issue of > >> how to deal with hidden devices, which Dom0 can access. See my RFC patch > >> "vPCI: account for hidden devices in modify_bars()". Whatever the solution > >> here, I think it wants to at least account for the extra need there. > > Yes, sorry, I should take care of that. > > > >> Now it is quite clear that pcidevs_lock isn't going to help with avoiding > >> the deadlock, as it's imo not an option at all to acquire that lock > >> everywhere else you access ->vpci (or else the vpci lock itself would be > >> pointless). 
But a per-domain auxiliary r/w lock may help: Other paths > >> would acquire it in read mode, and here you'd acquire it in write mode (in > >> the former case around the vpci lock, while in the latter case there may > >> then not be any need to acquire the individual vpci locks at all). FTAOD: > >> I haven't fully thought through all implications (and hence whether this is > >> viable in the first place); I expect you will, documenting what you've > >> found in the resulting patch description. Of course the double lock > >> acquire/release would then likely want hiding in helper functions. > > I've been also thinking about this, and whether it's really worth to > > have a per-device lock rather than a per-domain one that protects all > > vpci regions of the devices assigned to the domain. > > > > The OS is likely to serialize accesses to the PCI config space anyway, > > and the only place I could see a benefit of having per-device locks is > > in the handling of MSI-X tables, as the handling of the mask bit is > > likely very performance sensitive, so adding a per-domain lock there > > could be a bottleneck. > > > > We could alternatively do a per-domain rwlock for vpci and special case > > the MSI-X area to also have a per-device specific lock. At which point > > it becomes fairly similar to what you propose. > I need a decision. > Please. I'm afraid that's up to you. I cannot assure that any of the proposed options will actually be viable until someone attempts to implement them. I wouldn't want to impose a solution to you because I cannot guarantee it will work or result in better code than other options. I think there are two options: 1. Set a lock ordering for double locking (based on the memory address of the lock for example). 2. Introduce a per-domain rwlock that protects all of the devices assigned to a domain. Thanks, Roger.
On 04.02.22 13:37, Jan Beulich wrote: > On 04.02.2022 12:13, Roger Pau Monné wrote: >> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote: >>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: >>>> On 04.02.22 11:15, Jan Beulich wrote: >>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: >>>>>> On 04.02.22 09:52, Jan Beulich wrote: >>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: >>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>>> continue; >>>>>>>> } >>>>>>>> >>>>>>>> + spin_lock(&tmp->vpci_lock); >>>>>>>> + if ( !tmp->vpci ) >>>>>>>> + { >>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>> + continue; >>>>>>>> + } >>>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) >>>>>>>> { >>>>>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; >>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>>> rc = rangeset_remove_range(mem, start, end); >>>>>>>> if ( rc ) >>>>>>>> { >>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", >>>>>>>> start, end, rc); >>>>>>>> rangeset_destroy(mem); >>>>>>>> return rc; >>>>>>>> } >>>>>>>> } >>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>> } >>>>>>> At the first glance this simply looks like another unjustified (in the >>>>>>> description) change, as you're not converting anything here but you >>>>>>> actually add locking (and I realize this was there before, so I'm sorry >>>>>>> for not pointing this out earlier). >>>>>> Well, I thought that the description already has "...the lock can be >>>>>> used (and in a few cases is used right away) to check whether vpci >>>>>> is present" and this is enough for such uses as here. 
>>>>>>> But then I wonder whether you >>>>>>> actually tested this, since I can't help getting the impression that >>>>>>> you're introducing a live-lock: The function is called from cmd_write() >>>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet that >>>>>>> function already holds the lock, and the lock is not (currently) >>>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh >>>>>>> the locking looks to be entirely unnecessary.) >>>>>> Well, you are correct: if tmp != pdev then it is correct to acquire >>>>>> the lock. But if tmp == pdev and rom_only == true >>>>>> then we'll deadlock. >>>>>> >>>>>> It seems we need to have the locking conditional, e.g. only lock >>>>>> if tmp != pdev >>>>> Which will address the live-lock, but introduce ABBA deadlock potential >>>>> between the two locks. >>>> I am not sure I can suggest a better solution here >>>> @Roger, @Jan, could you please help here? >>> Well, first of all I'd like to mention that while it may have been okay to >>> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing >>> with DomU-s' lists of PCI devices. The requirement really applies to the >>> other use of for_each_pdev() as well (in vpci_dump_msi()), except that >>> there it probably wants to be a try-lock. >>> >>> Next I'd like to point out that here we have the still pending issue of >>> how to deal with hidden devices, which Dom0 can access. See my RFC patch >>> "vPCI: account for hidden devices in modify_bars()". Whatever the solution >>> here, I think it wants to at least account for the extra need there. >> Yes, sorry, I should take care of that. >> >>> Now it is quite clear that pcidevs_lock isn't going to help with avoiding >>> the deadlock, as it's imo not an option at all to acquire that lock >>> everywhere else you access ->vpci (or else the vpci lock itself would be >>> pointless). 
But a per-domain auxiliary r/w lock may help: Other paths >>> would acquire it in read mode, and here you'd acquire it in write mode (in >>> the former case around the vpci lock, while in the latter case there may >>> then not be any need to acquire the individual vpci locks at all). FTAOD: >>> I haven't fully thought through all implications (and hence whether this is >>> viable in the first place); I expect you will, documenting what you've >>> found in the resulting patch description. Of course the double lock >>> acquire/release would then likely want hiding in helper functions. >> I've been also thinking about this, and whether it's really worth to >> have a per-device lock rather than a per-domain one that protects all >> vpci regions of the devices assigned to the domain. >> >> The OS is likely to serialize accesses to the PCI config space anyway, >> and the only place I could see a benefit of having per-device locks is >> in the handling of MSI-X tables, as the handling of the mask bit is >> likely very performance sensitive, so adding a per-domain lock there >> could be a bottleneck. > Hmm, with method 1 accesses serializing globally is basically > unavoidable, but with MMCFG I see no reason why OSes may not (move > to) permit(ting) parallel accesses, with serialization perhaps done > only at device level. See our own pci_config_lock, which applies to > only method 1 accesses; we don't look to be serializing MMCFG > accesses at all. > >> We could alternatively do a per-domain rwlock for vpci and special case >> the MSI-X area to also have a per-device specific lock. At which point >> it becomes fairly similar to what you propose. @Jan, @Roger 1. d->vpci_lock - rwlock <- this protects vpci 2. pdev->vpci->msix_tbl_lock - rwlock <- this protects MSI-X tables or should it better be pdev->msix_tbl_lock as MSI-X tables don't really depend on vPCI? Does this sound like something that could fly? 
It takes quite a while to implement and test, so I would like to be sure of the approach up front before putting effort into it. > Indeed. > > Jan > Thank you in advance, Oleksandr
On 04.02.2022 13:37, Oleksandr Andrushchenko wrote: > > > On 04.02.22 13:37, Jan Beulich wrote: >> On 04.02.2022 12:13, Roger Pau Monné wrote: >>> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote: >>>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: >>>>> On 04.02.22 11:15, Jan Beulich wrote: >>>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: >>>>>>> On 04.02.22 09:52, Jan Beulich wrote: >>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: >>>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>>>> continue; >>>>>>>>> } >>>>>>>>> >>>>>>>>> + spin_lock(&tmp->vpci_lock); >>>>>>>>> + if ( !tmp->vpci ) >>>>>>>>> + { >>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>> + continue; >>>>>>>>> + } >>>>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) >>>>>>>>> { >>>>>>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; >>>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>>>> rc = rangeset_remove_range(mem, start, end); >>>>>>>>> if ( rc ) >>>>>>>>> { >>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", >>>>>>>>> start, end, rc); >>>>>>>>> rangeset_destroy(mem); >>>>>>>>> return rc; >>>>>>>>> } >>>>>>>>> } >>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>> } >>>>>>>> At the first glance this simply looks like another unjustified (in the >>>>>>>> description) change, as you're not converting anything here but you >>>>>>>> actually add locking (and I realize this was there before, so I'm sorry >>>>>>>> for not pointing this out earlier). >>>>>>> Well, I thought that the description already has "...the lock can be >>>>>>> used (and in a few cases is used right away) to check whether vpci >>>>>>> is present" and this is enough for such uses as here. 
>>>>>>>> But then I wonder whether you >>>>>>>> actually tested this, since I can't help getting the impression that >>>>>>>> you're introducing a live-lock: The function is called from cmd_write() >>>>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet that >>>>>>>> function already holds the lock, and the lock is not (currently) >>>>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh >>>>>>>> the locking looks to be entirely unnecessary.) >>>>>>> Well, you are correct: if tmp != pdev then it is correct to acquire >>>>>>> the lock. But if tmp == pdev and rom_only == true >>>>>>> then we'll deadlock. >>>>>>> >>>>>>> It seems we need to have the locking conditional, e.g. only lock >>>>>>> if tmp != pdev >>>>>> Which will address the live-lock, but introduce ABBA deadlock potential >>>>>> between the two locks. >>>>> I am not sure I can suggest a better solution here >>>>> @Roger, @Jan, could you please help here? >>>> Well, first of all I'd like to mention that while it may have been okay to >>>> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing >>>> with DomU-s' lists of PCI devices. The requirement really applies to the >>>> other use of for_each_pdev() as well (in vpci_dump_msi()), except that >>>> there it probably wants to be a try-lock. >>>> >>>> Next I'd like to point out that here we have the still pending issue of >>>> how to deal with hidden devices, which Dom0 can access. See my RFC patch >>>> "vPCI: account for hidden devices in modify_bars()". Whatever the solution >>>> here, I think it wants to at least account for the extra need there. >>> Yes, sorry, I should take care of that. >>> >>>> Now it is quite clear that pcidevs_lock isn't going to help with avoiding >>>> the deadlock, as it's imo not an option at all to acquire that lock >>>> everywhere else you access ->vpci (or else the vpci lock itself would be >>>> pointless). 
But a per-domain auxiliary r/w lock may help: Other paths >>>> would acquire it in read mode, and here you'd acquire it in write mode (in >>>> the former case around the vpci lock, while in the latter case there may >>>> then not be any need to acquire the individual vpci locks at all). FTAOD: >>>> I haven't fully thought through all implications (and hence whether this is >>>> viable in the first place); I expect you will, documenting what you've >>>> found in the resulting patch description. Of course the double lock >>>> acquire/release would then likely want hiding in helper functions. >>> I've been also thinking about this, and whether it's really worth to >>> have a per-device lock rather than a per-domain one that protects all >>> vpci regions of the devices assigned to the domain. >>> >>> The OS is likely to serialize accesses to the PCI config space anyway, >>> and the only place I could see a benefit of having per-device locks is >>> in the handling of MSI-X tables, as the handling of the mask bit is >>> likely very performance sensitive, so adding a per-domain lock there >>> could be a bottleneck. >> Hmm, with method 1 accesses serializing globally is basically >> unavoidable, but with MMCFG I see no reason why OSes may not (move >> to) permit(ting) parallel accesses, with serialization perhaps done >> only at device level. See our own pci_config_lock, which applies to >> only method 1 accesses; we don't look to be serializing MMCFG >> accesses at all. >> >>> We could alternatively do a per-domain rwlock for vpci and special case >>> the MSI-X area to also have a per-device specific lock. At which point >>> it becomes fairly similar to what you propose. > @Jan, @Roger > > 1. d->vpci_lock - rwlock <- this protects vpci > 2. pdev->vpci->msix_tbl_lock - rwlock <- this protects MSI-X tables > or should it better be pdev->msix_tbl_lock as MSI-X tables don't > really depend on vPCI? If so, perhaps indeed better the latter. 
But as said in reply to Roger, I'm not convinced (yet) that doing away with the per-device lock is a good move. As said there - we're ourselves doing fully parallel MMCFG accesses, so OSes ought to be fine to do so, too. Jan
On 04.02.22 14:47, Jan Beulich wrote: > On 04.02.2022 13:37, Oleksandr Andrushchenko wrote: >> >> On 04.02.22 13:37, Jan Beulich wrote: >>> On 04.02.2022 12:13, Roger Pau Monné wrote: >>>> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote: >>>>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: >>>>>> On 04.02.22 11:15, Jan Beulich wrote: >>>>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: >>>>>>>> On 04.02.22 09:52, Jan Beulich wrote: >>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: >>>>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>>>>> continue; >>>>>>>>>> } >>>>>>>>>> >>>>>>>>>> + spin_lock(&tmp->vpci_lock); >>>>>>>>>> + if ( !tmp->vpci ) >>>>>>>>>> + { >>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>> + continue; >>>>>>>>>> + } >>>>>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) >>>>>>>>>> { >>>>>>>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; >>>>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>>>>> rc = rangeset_remove_range(mem, start, end); >>>>>>>>>> if ( rc ) >>>>>>>>>> { >>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", >>>>>>>>>> start, end, rc); >>>>>>>>>> rangeset_destroy(mem); >>>>>>>>>> return rc; >>>>>>>>>> } >>>>>>>>>> } >>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>> } >>>>>>>>> At the first glance this simply looks like another unjustified (in the >>>>>>>>> description) change, as you're not converting anything here but you >>>>>>>>> actually add locking (and I realize this was there before, so I'm sorry >>>>>>>>> for not pointing this out earlier). >>>>>>>> Well, I thought that the description already has "...the lock can be >>>>>>>> used (and in a few cases is used right away) to check whether vpci >>>>>>>> is present" and this is enough for such uses as here. 
>>>>>>>>> But then I wonder whether you >>>>>>>>> actually tested this, since I can't help getting the impression that >>>>>>>>> you're introducing a live-lock: The function is called from cmd_write() >>>>>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet that >>>>>>>>> function already holds the lock, and the lock is not (currently) >>>>>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh >>>>>>>>> the locking looks to be entirely unnecessary.) >>>>>>>> Well, you are correct: if tmp != pdev then it is correct to acquire >>>>>>>> the lock. But if tmp == pdev and rom_only == true >>>>>>>> then we'll deadlock. >>>>>>>> >>>>>>>> It seems we need to have the locking conditional, e.g. only lock >>>>>>>> if tmp != pdev >>>>>>> Which will address the live-lock, but introduce ABBA deadlock potential >>>>>>> between the two locks. >>>>>> I am not sure I can suggest a better solution here >>>>>> @Roger, @Jan, could you please help here? >>>>> Well, first of all I'd like to mention that while it may have been okay to >>>>> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing >>>>> with DomU-s' lists of PCI devices. The requirement really applies to the >>>>> other use of for_each_pdev() as well (in vpci_dump_msi()), except that >>>>> there it probably wants to be a try-lock. >>>>> >>>>> Next I'd like to point out that here we have the still pending issue of >>>>> how to deal with hidden devices, which Dom0 can access. See my RFC patch >>>>> "vPCI: account for hidden devices in modify_bars()". Whatever the solution >>>>> here, I think it wants to at least account for the extra need there. >>>> Yes, sorry, I should take care of that. >>>> >>>>> Now it is quite clear that pcidevs_lock isn't going to help with avoiding >>>>> the deadlock, as it's imo not an option at all to acquire that lock >>>>> everywhere else you access ->vpci (or else the vpci lock itself would be >>>>> pointless). 
But a per-domain auxiliary r/w lock may help: Other paths >>>>> would acquire it in read mode, and here you'd acquire it in write mode (in >>>>> the former case around the vpci lock, while in the latter case there may >>>>> then not be any need to acquire the individual vpci locks at all). FTAOD: >>>>> I haven't fully thought through all implications (and hence whether this is >>>>> viable in the first place); I expect you will, documenting what you've >>>>> found in the resulting patch description. Of course the double lock >>>>> acquire/release would then likely want hiding in helper functions. >>>> I've been also thinking about this, and whether it's really worth to >>>> have a per-device lock rather than a per-domain one that protects all >>>> vpci regions of the devices assigned to the domain. >>>> >>>> The OS is likely to serialize accesses to the PCI config space anyway, >>>> and the only place I could see a benefit of having per-device locks is >>>> in the handling of MSI-X tables, as the handling of the mask bit is >>>> likely very performance sensitive, so adding a per-domain lock there >>>> could be a bottleneck. >>> Hmm, with method 1 accesses serializing globally is basically >>> unavoidable, but with MMCFG I see no reason why OSes may not (move >>> to) permit(ting) parallel accesses, with serialization perhaps done >>> only at device level. See our own pci_config_lock, which applies to >>> only method 1 accesses; we don't look to be serializing MMCFG >>> accesses at all. >>> >>>> We could alternatively do a per-domain rwlock for vpci and special case >>>> the MSI-X area to also have a per-device specific lock. At which point >>>> it becomes fairly similar to what you propose. >> @Jan, @Roger >> >> 1. d->vpci_lock - rwlock <- this protects vpci >> 2. pdev->vpci->msix_tbl_lock - rwlock <- this protects MSI-X tables >> or should it better be pdev->msix_tbl_lock as MSI-X tables don't >> really depend on vPCI? > If so, perhaps indeed better the latter. 
But as said in reply to Roger, > I'm not convinced (yet) that doing away with the per-device lock is a > good move. As said there - we're ourselves doing fully parallel MMCFG > accesses, so OSes ought to be fine to do so, too. But with pdev->vpci_lock we face ABBA... > > Jan > >
On 04.02.2022 13:53, Oleksandr Andrushchenko wrote: > > > On 04.02.22 14:47, Jan Beulich wrote: >> On 04.02.2022 13:37, Oleksandr Andrushchenko wrote: >>> >>> On 04.02.22 13:37, Jan Beulich wrote: >>>> On 04.02.2022 12:13, Roger Pau Monné wrote: >>>>> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote: >>>>>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: >>>>>>> On 04.02.22 11:15, Jan Beulich wrote: >>>>>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: >>>>>>>>> On 04.02.22 09:52, Jan Beulich wrote: >>>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: >>>>>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>>>>>> continue; >>>>>>>>>>> } >>>>>>>>>>> >>>>>>>>>>> + spin_lock(&tmp->vpci_lock); >>>>>>>>>>> + if ( !tmp->vpci ) >>>>>>>>>>> + { >>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>>> + continue; >>>>>>>>>>> + } >>>>>>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) >>>>>>>>>>> { >>>>>>>>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; >>>>>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>>>>>> rc = rangeset_remove_range(mem, start, end); >>>>>>>>>>> if ( rc ) >>>>>>>>>>> { >>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", >>>>>>>>>>> start, end, rc); >>>>>>>>>>> rangeset_destroy(mem); >>>>>>>>>>> return rc; >>>>>>>>>>> } >>>>>>>>>>> } >>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>>> } >>>>>>>>>> At the first glance this simply looks like another unjustified (in the >>>>>>>>>> description) change, as you're not converting anything here but you >>>>>>>>>> actually add locking (and I realize this was there before, so I'm sorry >>>>>>>>>> for not pointing this out earlier). 
>>>>>>>>> Well, I thought that the description already has "...the lock can be >>>>>>>>> used (and in a few cases is used right away) to check whether vpci >>>>>>>>> is present" and this is enough for such uses as here. >>>>>>>>>> But then I wonder whether you >>>>>>>>>> actually tested this, since I can't help getting the impression that >>>>>>>>>> you're introducing a live-lock: The function is called from cmd_write() >>>>>>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet that >>>>>>>>>> function already holds the lock, and the lock is not (currently) >>>>>>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh >>>>>>>>>> the locking looks to be entirely unnecessary.) >>>>>>>>> Well, you are correct: if tmp != pdev then it is correct to acquire >>>>>>>>> the lock. But if tmp == pdev and rom_only == true >>>>>>>>> then we'll deadlock. >>>>>>>>> >>>>>>>>> It seems we need to have the locking conditional, e.g. only lock >>>>>>>>> if tmp != pdev >>>>>>>> Which will address the live-lock, but introduce ABBA deadlock potential >>>>>>>> between the two locks. >>>>>>> I am not sure I can suggest a better solution here >>>>>>> @Roger, @Jan, could you please help here? >>>>>> Well, first of all I'd like to mention that while it may have been okay to >>>>>> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing >>>>>> with DomU-s' lists of PCI devices. The requirement really applies to the >>>>>> other use of for_each_pdev() as well (in vpci_dump_msi()), except that >>>>>> there it probably wants to be a try-lock. >>>>>> >>>>>> Next I'd like to point out that here we have the still pending issue of >>>>>> how to deal with hidden devices, which Dom0 can access. See my RFC patch >>>>>> "vPCI: account for hidden devices in modify_bars()". Whatever the solution >>>>>> here, I think it wants to at least account for the extra need there. >>>>> Yes, sorry, I should take care of that. 
>>>>> >>>>>> Now it is quite clear that pcidevs_lock isn't going to help with avoiding >>>>>> the deadlock, as it's imo not an option at all to acquire that lock >>>>>> everywhere else you access ->vpci (or else the vpci lock itself would be >>>>>> pointless). But a per-domain auxiliary r/w lock may help: Other paths >>>>>> would acquire it in read mode, and here you'd acquire it in write mode (in >>>>>> the former case around the vpci lock, while in the latter case there may >>>>>> then not be any need to acquire the individual vpci locks at all). FTAOD: >>>>>> I haven't fully thought through all implications (and hence whether this is >>>>>> viable in the first place); I expect you will, documenting what you've >>>>>> found in the resulting patch description. Of course the double lock >>>>>> acquire/release would then likely want hiding in helper functions. >>>>> I've been also thinking about this, and whether it's really worth to >>>>> have a per-device lock rather than a per-domain one that protects all >>>>> vpci regions of the devices assigned to the domain. >>>>> >>>>> The OS is likely to serialize accesses to the PCI config space anyway, >>>>> and the only place I could see a benefit of having per-device locks is >>>>> in the handling of MSI-X tables, as the handling of the mask bit is >>>>> likely very performance sensitive, so adding a per-domain lock there >>>>> could be a bottleneck. >>>> Hmm, with method 1 accesses serializing globally is basically >>>> unavoidable, but with MMCFG I see no reason why OSes may not (move >>>> to) permit(ting) parallel accesses, with serialization perhaps done >>>> only at device level. See our own pci_config_lock, which applies to >>>> only method 1 accesses; we don't look to be serializing MMCFG >>>> accesses at all. >>>> >>>>> We could alternatively do a per-domain rwlock for vpci and special case >>>>> the MSI-X area to also have a per-device specific lock. 
At which point >>>>> it becomes fairly similar to what you propose. >>> @Jan, @Roger >>> >>> 1. d->vpci_lock - rwlock <- this protects vpci >>> 2. pdev->vpci->msix_tbl_lock - rwlock <- this protects MSI-X tables >>> or should it better be pdev->msix_tbl_lock as MSI-X tables don't >>> really depend on vPCI? >> If so, perhaps indeed better the latter. But as said in reply to Roger, >> I'm not convinced (yet) that doing away with the per-device lock is a >> good move. As said there - we're ourselves doing fully parallel MMCFG >> accesses, so OSes ought to be fine to do so, too. > But with pdev->vpci_lock we face ABBA... I didn't say without per-domain r/w lock, did I? I stand by my earlier outline. Jan
On Fri, Feb 04, 2022 at 12:53:20PM +0000, Oleksandr Andrushchenko wrote: > > > On 04.02.22 14:47, Jan Beulich wrote: > > On 04.02.2022 13:37, Oleksandr Andrushchenko wrote: > >> > >> On 04.02.22 13:37, Jan Beulich wrote: > >>> On 04.02.2022 12:13, Roger Pau Monné wrote: > >>>> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote: > >>>>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: > >>>>>> On 04.02.22 11:15, Jan Beulich wrote: > >>>>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: > >>>>>>>> On 04.02.22 09:52, Jan Beulich wrote: > >>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: > >>>>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) > >>>>>>>>>> continue; > >>>>>>>>>> } > >>>>>>>>>> > >>>>>>>>>> + spin_lock(&tmp->vpci_lock); > >>>>>>>>>> + if ( !tmp->vpci ) > >>>>>>>>>> + { > >>>>>>>>>> + spin_unlock(&tmp->vpci_lock); > >>>>>>>>>> + continue; > >>>>>>>>>> + } > >>>>>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) > >>>>>>>>>> { > >>>>>>>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; > >>>>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) > >>>>>>>>>> rc = rangeset_remove_range(mem, start, end); > >>>>>>>>>> if ( rc ) > >>>>>>>>>> { > >>>>>>>>>> + spin_unlock(&tmp->vpci_lock); > >>>>>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", > >>>>>>>>>> start, end, rc); > >>>>>>>>>> rangeset_destroy(mem); > >>>>>>>>>> return rc; > >>>>>>>>>> } > >>>>>>>>>> } > >>>>>>>>>> + spin_unlock(&tmp->vpci_lock); > >>>>>>>>>> } > >>>>>>>>> At the first glance this simply looks like another unjustified (in the > >>>>>>>>> description) change, as you're not converting anything here but you > >>>>>>>>> actually add locking (and I realize this was there before, so I'm sorry > >>>>>>>>> for not pointing this out earlier). 
> >>>>>>>> Well, I thought that the description already has "...the lock can be > >>>>>>>> used (and in a few cases is used right away) to check whether vpci > >>>>>>>> is present" and this is enough for such uses as here. > >>>>>>>>> But then I wonder whether you > >>>>>>>>> actually tested this, since I can't help getting the impression that > >>>>>>>>> you're introducing a live-lock: The function is called from cmd_write() > >>>>>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet that > >>>>>>>>> function already holds the lock, and the lock is not (currently) > >>>>>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh > >>>>>>>>> the locking looks to be entirely unnecessary.) > >>>>>>>> Well, you are correct: if tmp != pdev then it is correct to acquire > >>>>>>>> the lock. But if tmp == pdev and rom_only == true > >>>>>>>> then we'll deadlock. > >>>>>>>> > >>>>>>>> It seems we need to have the locking conditional, e.g. only lock > >>>>>>>> if tmp != pdev > >>>>>>> Which will address the live-lock, but introduce ABBA deadlock potential > >>>>>>> between the two locks. > >>>>>> I am not sure I can suggest a better solution here > >>>>>> @Roger, @Jan, could you please help here? > >>>>> Well, first of all I'd like to mention that while it may have been okay to > >>>>> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing > >>>>> with DomU-s' lists of PCI devices. The requirement really applies to the > >>>>> other use of for_each_pdev() as well (in vpci_dump_msi()), except that > >>>>> there it probably wants to be a try-lock. > >>>>> > >>>>> Next I'd like to point out that here we have the still pending issue of > >>>>> how to deal with hidden devices, which Dom0 can access. See my RFC patch > >>>>> "vPCI: account for hidden devices in modify_bars()". Whatever the solution > >>>>> here, I think it wants to at least account for the extra need there. > >>>> Yes, sorry, I should take care of that. 
> >>>> > >>>>> Now it is quite clear that pcidevs_lock isn't going to help with avoiding > >>>>> the deadlock, as it's imo not an option at all to acquire that lock > >>>>> everywhere else you access ->vpci (or else the vpci lock itself would be > >>>>> pointless). But a per-domain auxiliary r/w lock may help: Other paths > >>>>> would acquire it in read mode, and here you'd acquire it in write mode (in > >>>>> the former case around the vpci lock, while in the latter case there may > >>>>> then not be any need to acquire the individual vpci locks at all). FTAOD: > >>>>> I haven't fully thought through all implications (and hence whether this is > >>>>> viable in the first place); I expect you will, documenting what you've > >>>>> found in the resulting patch description. Of course the double lock > >>>>> acquire/release would then likely want hiding in helper functions. > >>>> I've been also thinking about this, and whether it's really worth to > >>>> have a per-device lock rather than a per-domain one that protects all > >>>> vpci regions of the devices assigned to the domain. > >>>> > >>>> The OS is likely to serialize accesses to the PCI config space anyway, > >>>> and the only place I could see a benefit of having per-device locks is > >>>> in the handling of MSI-X tables, as the handling of the mask bit is > >>>> likely very performance sensitive, so adding a per-domain lock there > >>>> could be a bottleneck. > >>> Hmm, with method 1 accesses serializing globally is basically > >>> unavoidable, but with MMCFG I see no reason why OSes may not (move > >>> to) permit(ting) parallel accesses, with serialization perhaps done > >>> only at device level. See our own pci_config_lock, which applies to > >>> only method 1 accesses; we don't look to be serializing MMCFG > >>> accesses at all. > >>> > >>>> We could alternatively do a per-domain rwlock for vpci and special case > >>>> the MSI-X area to also have a per-device specific lock. 
At which point > >>>> it becomes fairly similar to what you propose. > >> @Jan, @Roger > >> > >> 1. d->vpci_lock - rwlock <- this protects vpci > >> 2. pdev->vpci->msix_tbl_lock - rwlock <- this protects MSI-X tables > >> or should it better be pdev->msix_tbl_lock as MSI-X tables don't > >> really depend on vPCI? > > If so, perhaps indeed better the latter. But as said in reply to Roger, > > I'm not convinced (yet) that doing away with the per-device lock is a > > good move. As said there - we're ourselves doing fully parallel MMCFG > > accesses, so OSes ought to be fine to do so, too. > But with pdev->vpci_lock we face ABBA... I think it would be easier to start with a per-domain rwlock that guarantees pdev->vpci cannot be removed under our feet. This would be taken in read mode in vpci_{read,write} and in write mode when removing a device from a domain. Then there are also other issues regarding vPCI locking that need to be fixed, but that lock would likely be a start. Thanks, Roger.
On 04.02.22 15:06, Roger Pau Monné wrote: > On Fri, Feb 04, 2022 at 12:53:20PM +0000, Oleksandr Andrushchenko wrote: >> >> On 04.02.22 14:47, Jan Beulich wrote: >>> On 04.02.2022 13:37, Oleksandr Andrushchenko wrote: >>>> On 04.02.22 13:37, Jan Beulich wrote: >>>>> On 04.02.2022 12:13, Roger Pau Monné wrote: >>>>>> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote: >>>>>>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: >>>>>>>> On 04.02.22 11:15, Jan Beulich wrote: >>>>>>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: >>>>>>>>>> On 04.02.22 09:52, Jan Beulich wrote: >>>>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: >>>>>>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>>>>>>> continue; >>>>>>>>>>>> } >>>>>>>>>>>> >>>>>>>>>>>> + spin_lock(&tmp->vpci_lock); >>>>>>>>>>>> + if ( !tmp->vpci ) >>>>>>>>>>>> + { >>>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>>>> + continue; >>>>>>>>>>>> + } >>>>>>>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) >>>>>>>>>>>> { >>>>>>>>>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; >>>>>>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>>>>>>> rc = rangeset_remove_range(mem, start, end); >>>>>>>>>>>> if ( rc ) >>>>>>>>>>>> { >>>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", >>>>>>>>>>>> start, end, rc); >>>>>>>>>>>> rangeset_destroy(mem); >>>>>>>>>>>> return rc; >>>>>>>>>>>> } >>>>>>>>>>>> } >>>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>>>> } >>>>>>>>>>> At the first glance this simply looks like another unjustified (in the >>>>>>>>>>> description) change, as you're not converting anything here but you >>>>>>>>>>> actually add locking (and I realize this was there before, so I'm sorry >>>>>>>>>>> for not pointing this out earlier). 
>>>>>>>>>> Well, I thought that the description already has "...the lock can be >>>>>>>>>> used (and in a few cases is used right away) to check whether vpci >>>>>>>>>> is present" and this is enough for such uses as here. >>>>>>>>>>> But then I wonder whether you >>>>>>>>>>> actually tested this, since I can't help getting the impression that >>>>>>>>>>> you're introducing a live-lock: The function is called from cmd_write() >>>>>>>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet that >>>>>>>>>>> function already holds the lock, and the lock is not (currently) >>>>>>>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh >>>>>>>>>>> the locking looks to be entirely unnecessary.) >>>>>>>>>> Well, you are correct: if tmp != pdev then it is correct to acquire >>>>>>>>>> the lock. But if tmp == pdev and rom_only == true >>>>>>>>>> then we'll deadlock. >>>>>>>>>> >>>>>>>>>> It seems we need to have the locking conditional, e.g. only lock >>>>>>>>>> if tmp != pdev >>>>>>>>> Which will address the live-lock, but introduce ABBA deadlock potential >>>>>>>>> between the two locks. >>>>>>>> I am not sure I can suggest a better solution here >>>>>>>> @Roger, @Jan, could you please help here? >>>>>>> Well, first of all I'd like to mention that while it may have been okay to >>>>>>> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing >>>>>>> with DomU-s' lists of PCI devices. The requirement really applies to the >>>>>>> other use of for_each_pdev() as well (in vpci_dump_msi()), except that >>>>>>> there it probably wants to be a try-lock. >>>>>>> >>>>>>> Next I'd like to point out that here we have the still pending issue of >>>>>>> how to deal with hidden devices, which Dom0 can access. See my RFC patch >>>>>>> "vPCI: account for hidden devices in modify_bars()". Whatever the solution >>>>>>> here, I think it wants to at least account for the extra need there. >>>>>> Yes, sorry, I should take care of that. 
>>>>>> >>>>>>> Now it is quite clear that pcidevs_lock isn't going to help with avoiding >>>>>>> the deadlock, as it's imo not an option at all to acquire that lock >>>>>>> everywhere else you access ->vpci (or else the vpci lock itself would be >>>>>>> pointless). But a per-domain auxiliary r/w lock may help: Other paths >>>>>>> would acquire it in read mode, and here you'd acquire it in write mode (in >>>>>>> the former case around the vpci lock, while in the latter case there may >>>>>>> then not be any need to acquire the individual vpci locks at all). FTAOD: >>>>>>> I haven't fully thought through all implications (and hence whether this is >>>>>>> viable in the first place); I expect you will, documenting what you've >>>>>>> found in the resulting patch description. Of course the double lock >>>>>>> acquire/release would then likely want hiding in helper functions. >>>>>> I've been also thinking about this, and whether it's really worth to >>>>>> have a per-device lock rather than a per-domain one that protects all >>>>>> vpci regions of the devices assigned to the domain. >>>>>> >>>>>> The OS is likely to serialize accesses to the PCI config space anyway, >>>>>> and the only place I could see a benefit of having per-device locks is >>>>>> in the handling of MSI-X tables, as the handling of the mask bit is >>>>>> likely very performance sensitive, so adding a per-domain lock there >>>>>> could be a bottleneck. >>>>> Hmm, with method 1 accesses serializing globally is basically >>>>> unavoidable, but with MMCFG I see no reason why OSes may not (move >>>>> to) permit(ting) parallel accesses, with serialization perhaps done >>>>> only at device level. See our own pci_config_lock, which applies to >>>>> only method 1 accesses; we don't look to be serializing MMCFG >>>>> accesses at all. >>>>> >>>>>> We could alternatively do a per-domain rwlock for vpci and special case >>>>>> the MSI-X area to also have a per-device specific lock. 
At which point >>>>>> it becomes fairly similar to what you propose. >>>> @Jan, @Roger >>>> >>>> 1. d->vpci_lock - rwlock <- this protects vpci >>>> 2. pdev->vpci->msix_tbl_lock - rwlock <- this protects MSI-X tables >>>> or should it better be pdev->msix_tbl_lock as MSI-X tables don't >>>> really depend on vPCI? >>> If so, perhaps indeed better the latter. But as said in reply to Roger, >>> I'm not convinced (yet) that doing away with the per-device lock is a >>> good move. As said there - we're ourselves doing fully parallel MMCFG >>> accesses, so OSes ought to be fine to do so, too. >> But with pdev->vpci_lock we face ABBA... > I think it would be easier to start with a per-domain rwlock that > guarantees pdev->vpci cannot be removed under our feet. This would be > taken in read mode in vpci_{read,write} and in write mode when > removing a device from a domain. > > Then there are also other issues regarding vPCI locking that need to > be fixed, but that lock would likely be a start. Or let's see the problem at a different angle: this is the only place which breaks the use of pdev->vpci_lock. Because all other places do not try to acquire the lock of any two devices at a time. So, what if we re-work the offending piece of code instead? That way we do not break parallel access and have the lock per-device which might also be a plus. By re-work I mean, that instead of reading already mapped regions from tmp we can employ a d->pci_mapped_regions range set which will hold all the already mapped ranges. And when it is needed to access that range set we use pcidevs_lock which seems to be rare. So, modify_bars will rely on pdev->vpci_lock + pcidevs_lock and ABBA won't be possible at all. > > Thanks, Roger.
On Fri, Feb 04, 2022 at 02:43:07PM +0000, Oleksandr Andrushchenko wrote: > > > On 04.02.22 15:06, Roger Pau Monné wrote: > > On Fri, Feb 04, 2022 at 12:53:20PM +0000, Oleksandr Andrushchenko wrote: > >> > >> On 04.02.22 14:47, Jan Beulich wrote: > >>> On 04.02.2022 13:37, Oleksandr Andrushchenko wrote: > >>>> On 04.02.22 13:37, Jan Beulich wrote: > >>>>> On 04.02.2022 12:13, Roger Pau Monné wrote: > >>>>>> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote: > >>>>>>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: > >>>>>>>> On 04.02.22 11:15, Jan Beulich wrote: > >>>>>>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: > >>>>>>>>>> On 04.02.22 09:52, Jan Beulich wrote: > >>>>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: > >>>>>>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) > >>>>>>>>>>>> continue; > >>>>>>>>>>>> } > >>>>>>>>>>>> > >>>>>>>>>>>> + spin_lock(&tmp->vpci_lock); > >>>>>>>>>>>> + if ( !tmp->vpci ) > >>>>>>>>>>>> + { > >>>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); > >>>>>>>>>>>> + continue; > >>>>>>>>>>>> + } > >>>>>>>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) > >>>>>>>>>>>> { > >>>>>>>>>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; > >>>>>>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) > >>>>>>>>>>>> rc = rangeset_remove_range(mem, start, end); > >>>>>>>>>>>> if ( rc ) > >>>>>>>>>>>> { > >>>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); > >>>>>>>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", > >>>>>>>>>>>> start, end, rc); > >>>>>>>>>>>> rangeset_destroy(mem); > >>>>>>>>>>>> return rc; > >>>>>>>>>>>> } > >>>>>>>>>>>> } > >>>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); > >>>>>>>>>>>> } > >>>>>>>>>>> At the first glance this simply looks like another unjustified (in the > >>>>>>>>>>> description) change, as you're not converting anything here but you > 
>>>>>>>>>>> actually add locking (and I realize this was there before, so I'm sorry > >>>>>>>>>>> for not pointing this out earlier). > >>>>>>>>>> Well, I thought that the description already has "...the lock can be > >>>>>>>>>> used (and in a few cases is used right away) to check whether vpci > >>>>>>>>>> is present" and this is enough for such uses as here. > >>>>>>>>>>> But then I wonder whether you > >>>>>>>>>>> actually tested this, since I can't help getting the impression that > >>>>>>>>>>> you're introducing a live-lock: The function is called from cmd_write() > >>>>>>>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet that > >>>>>>>>>>> function already holds the lock, and the lock is not (currently) > >>>>>>>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh > >>>>>>>>>>> the locking looks to be entirely unnecessary.) > >>>>>>>>>> Well, you are correct: if tmp != pdev then it is correct to acquire > >>>>>>>>>> the lock. But if tmp == pdev and rom_only == true > >>>>>>>>>> then we'll deadlock. > >>>>>>>>>> > >>>>>>>>>> It seems we need to have the locking conditional, e.g. only lock > >>>>>>>>>> if tmp != pdev > >>>>>>>>> Which will address the live-lock, but introduce ABBA deadlock potential > >>>>>>>>> between the two locks. > >>>>>>>> I am not sure I can suggest a better solution here > >>>>>>>> @Roger, @Jan, could you please help here? > >>>>>>> Well, first of all I'd like to mention that while it may have been okay to > >>>>>>> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing > >>>>>>> with DomU-s' lists of PCI devices. The requirement really applies to the > >>>>>>> other use of for_each_pdev() as well (in vpci_dump_msi()), except that > >>>>>>> there it probably wants to be a try-lock. > >>>>>>> > >>>>>>> Next I'd like to point out that here we have the still pending issue of > >>>>>>> how to deal with hidden devices, which Dom0 can access. 
See my RFC patch > >>>>>>> "vPCI: account for hidden devices in modify_bars()". Whatever the solution > >>>>>>> here, I think it wants to at least account for the extra need there. > >>>>>> Yes, sorry, I should take care of that. > >>>>>> > >>>>>>> Now it is quite clear that pcidevs_lock isn't going to help with avoiding > >>>>>>> the deadlock, as it's imo not an option at all to acquire that lock > >>>>>>> everywhere else you access ->vpci (or else the vpci lock itself would be > >>>>>>> pointless). But a per-domain auxiliary r/w lock may help: Other paths > >>>>>>> would acquire it in read mode, and here you'd acquire it in write mode (in > >>>>>>> the former case around the vpci lock, while in the latter case there may > >>>>>>> then not be any need to acquire the individual vpci locks at all). FTAOD: > >>>>>>> I haven't fully thought through all implications (and hence whether this is > >>>>>>> viable in the first place); I expect you will, documenting what you've > >>>>>>> found in the resulting patch description. Of course the double lock > >>>>>>> acquire/release would then likely want hiding in helper functions. > >>>>>> I've been also thinking about this, and whether it's really worth to > >>>>>> have a per-device lock rather than a per-domain one that protects all > >>>>>> vpci regions of the devices assigned to the domain. > >>>>>> > >>>>>> The OS is likely to serialize accesses to the PCI config space anyway, > >>>>>> and the only place I could see a benefit of having per-device locks is > >>>>>> in the handling of MSI-X tables, as the handling of the mask bit is > >>>>>> likely very performance sensitive, so adding a per-domain lock there > >>>>>> could be a bottleneck. > >>>>> Hmm, with method 1 accesses serializing globally is basically > >>>>> unavoidable, but with MMCFG I see no reason why OSes may not (move > >>>>> to) permit(ting) parallel accesses, with serialization perhaps done > >>>>> only at device level. 
See our own pci_config_lock, which applies to > >>>>> only method 1 accesses; we don't look to be serializing MMCFG > >>>>> accesses at all. > >>>>> > >>>>>> We could alternatively do a per-domain rwlock for vpci and special case > >>>>>> the MSI-X area to also have a per-device specific lock. At which point > >>>>>> it becomes fairly similar to what you propose. > >>>> @Jan, @Roger > >>>> > >>>> 1. d->vpci_lock - rwlock <- this protects vpci > >>>> 2. pdev->vpci->msix_tbl_lock - rwlock <- this protects MSI-X tables > >>>> or should it better be pdev->msix_tbl_lock as MSI-X tables don't > >>>> really depend on vPCI? > >>> If so, perhaps indeed better the latter. But as said in reply to Roger, > >>> I'm not convinced (yet) that doing away with the per-device lock is a > >>> good move. As said there - we're ourselves doing fully parallel MMCFG > >>> accesses, so OSes ought to be fine to do so, too. > >> But with pdev->vpci_lock we face ABBA... > > I think it would be easier to start with a per-domain rwlock that > > guarantees pdev->vpci cannot be removed under our feet. This would be > > taken in read mode in vpci_{read,write} and in write mode when > > removing a device from a domain. > > > > Then there are also other issues regarding vPCI locking that need to > > be fixed, but that lock would likely be a start. > Or let's see the problem at a different angle: this is the only place > which breaks the use of pdev->vpci_lock. Because all other places > do not try to acquire the lock of any two devices at a time. > So, what if we re-work the offending piece of code instead? > That way we do not break parallel access and have the lock per-device > which might also be a plus. > > By re-work I mean, that instead of reading already mapped regions > from tmp we can employ a d->pci_mapped_regions range set which > will hold all the already mapped ranges. And when it is needed to access > that range set we use pcidevs_lock which seems to be rare. 
> So, modify_bars will rely on pdev->vpci_lock + pcidevs_lock and > ABBA won't be possible at all. Sadly that won't replace the usage of the loop in modify_bars. This is not (exclusively) done in order to prevent mapping the same region multiple times, but rather to prevent unmapping of regions as long as there's an enabled BAR that's using it. If you wanted to use something like d->pci_mapped_regions it would have to keep reference counts to regions, in order to know when a mapping is no longer required by any BAR on the system with memory decoding enabled. Thanks, Roger.
Hello, On 04.02.22 16:57, Roger Pau Monné wrote: > On Fri, Feb 04, 2022 at 02:43:07PM +0000, Oleksandr Andrushchenko wrote: >> >> On 04.02.22 15:06, Roger Pau Monné wrote: >>> On Fri, Feb 04, 2022 at 12:53:20PM +0000, Oleksandr Andrushchenko wrote: >>>> On 04.02.22 14:47, Jan Beulich wrote: >>>>> On 04.02.2022 13:37, Oleksandr Andrushchenko wrote: >>>>>> On 04.02.22 13:37, Jan Beulich wrote: >>>>>>> On 04.02.2022 12:13, Roger Pau Monné wrote: >>>>>>>> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote: >>>>>>>>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: >>>>>>>>>> On 04.02.22 11:15, Jan Beulich wrote: >>>>>>>>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: >>>>>>>>>>>> On 04.02.22 09:52, Jan Beulich wrote: >>>>>>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: >>>>>>>>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>>>>>>>>> continue; >>>>>>>>>>>>>> } >>>>>>>>>>>>>> >>>>>>>>>>>>>> + spin_lock(&tmp->vpci_lock); >>>>>>>>>>>>>> + if ( !tmp->vpci ) >>>>>>>>>>>>>> + { >>>>>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>>>>>> + continue; >>>>>>>>>>>>>> + } >>>>>>>>>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) >>>>>>>>>>>>>> { >>>>>>>>>>>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; >>>>>>>>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>>>>>>>>> rc = rangeset_remove_range(mem, start, end); >>>>>>>>>>>>>> if ( rc ) >>>>>>>>>>>>>> { >>>>>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", >>>>>>>>>>>>>> start, end, rc); >>>>>>>>>>>>>> rangeset_destroy(mem); >>>>>>>>>>>>>> return rc; >>>>>>>>>>>>>> } >>>>>>>>>>>>>> } >>>>>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>>>>>> } >>>>>>>>>>>>> At the first glance this simply looks like another unjustified (in the >>>>>>>>>>>>> description) change, as 
you're not converting anything here but you >>>>>>>>>>>>> actually add locking (and I realize this was there before, so I'm sorry >>>>>>>>>>>>> for not pointing this out earlier). >>>>>>>>>>>> Well, I thought that the description already has "...the lock can be >>>>>>>>>>>> used (and in a few cases is used right away) to check whether vpci >>>>>>>>>>>> is present" and this is enough for such uses as here. >>>>>>>>>>>>> But then I wonder whether you >>>>>>>>>>>>> actually tested this, since I can't help getting the impression that >>>>>>>>>>>>> you're introducing a live-lock: The function is called from cmd_write() >>>>>>>>>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet that >>>>>>>>>>>>> function already holds the lock, and the lock is not (currently) >>>>>>>>>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh >>>>>>>>>>>>> the locking looks to be entirely unnecessary.) >>>>>>>>>>>> Well, you are correct: if tmp != pdev then it is correct to acquire >>>>>>>>>>>> the lock. But if tmp == pdev and rom_only == true >>>>>>>>>>>> then we'll deadlock. >>>>>>>>>>>> >>>>>>>>>>>> It seems we need to have the locking conditional, e.g. only lock >>>>>>>>>>>> if tmp != pdev >>>>>>>>>>> Which will address the live-lock, but introduce ABBA deadlock potential >>>>>>>>>>> between the two locks. >>>>>>>>>> I am not sure I can suggest a better solution here >>>>>>>>>> @Roger, @Jan, could you please help here? >>>>>>>>> Well, first of all I'd like to mention that while it may have been okay to >>>>>>>>> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing >>>>>>>>> with DomU-s' lists of PCI devices. The requirement really applies to the >>>>>>>>> other use of for_each_pdev() as well (in vpci_dump_msi()), except that >>>>>>>>> there it probably wants to be a try-lock. 
>>>>>>>>> >>>>>>>>> Next I'd like to point out that here we have the still pending issue of >>>>>>>>> how to deal with hidden devices, which Dom0 can access. See my RFC patch >>>>>>>>> "vPCI: account for hidden devices in modify_bars()". Whatever the solution >>>>>>>>> here, I think it wants to at least account for the extra need there. >>>>>>>> Yes, sorry, I should take care of that. >>>>>>>> >>>>>>>>> Now it is quite clear that pcidevs_lock isn't going to help with avoiding >>>>>>>>> the deadlock, as it's imo not an option at all to acquire that lock >>>>>>>>> everywhere else you access ->vpci (or else the vpci lock itself would be >>>>>>>>> pointless). But a per-domain auxiliary r/w lock may help: Other paths >>>>>>>>> would acquire it in read mode, and here you'd acquire it in write mode (in >>>>>>>>> the former case around the vpci lock, while in the latter case there may >>>>>>>>> then not be any need to acquire the individual vpci locks at all). FTAOD: >>>>>>>>> I haven't fully thought through all implications (and hence whether this is >>>>>>>>> viable in the first place); I expect you will, documenting what you've >>>>>>>>> found in the resulting patch description. Of course the double lock >>>>>>>>> acquire/release would then likely want hiding in helper functions. >>>>>>>> I've been also thinking about this, and whether it's really worth to >>>>>>>> have a per-device lock rather than a per-domain one that protects all >>>>>>>> vpci regions of the devices assigned to the domain. >>>>>>>> >>>>>>>> The OS is likely to serialize accesses to the PCI config space anyway, >>>>>>>> and the only place I could see a benefit of having per-device locks is >>>>>>>> in the handling of MSI-X tables, as the handling of the mask bit is >>>>>>>> likely very performance sensitive, so adding a per-domain lock there >>>>>>>> could be a bottleneck. 
>>>>>>> Hmm, with method 1 accesses serializing globally is basically >>>>>>> unavoidable, but with MMCFG I see no reason why OSes may not (move >>>>>>> to) permit(ting) parallel accesses, with serialization perhaps done >>>>>>> only at device level. See our own pci_config_lock, which applies to >>>>>>> only method 1 accesses; we don't look to be serializing MMCFG >>>>>>> accesses at all. >>>>>>> >>>>>>>> We could alternatively do a per-domain rwlock for vpci and special case >>>>>>>> the MSI-X area to also have a per-device specific lock. At which point >>>>>>>> it becomes fairly similar to what you propose. >>>>>> @Jan, @Roger >>>>>> >>>>>> 1. d->vpci_lock - rwlock <- this protects vpci >>>>>> 2. pdev->vpci->msix_tbl_lock - rwlock <- this protects MSI-X tables >>>>>> or should it better be pdev->msix_tbl_lock as MSI-X tables don't >>>>>> really depend on vPCI? >>>>> If so, perhaps indeed better the latter. But as said in reply to Roger, >>>>> I'm not convinced (yet) that doing away with the per-device lock is a >>>>> good move. As said there - we're ourselves doing fully parallel MMCFG >>>>> accesses, so OSes ought to be fine to do so, too. >>>> But with pdev->vpci_lock we face ABBA... >>> I think it would be easier to start with a per-domain rwlock that >>> guarantees pdev->vpci cannot be removed under our feet. This would be >>> taken in read mode in vpci_{read,write} and in write mode when >>> removing a device from a domain. >>> >>> Then there are also other issues regarding vPCI locking that need to >>> be fixed, but that lock would likely be a start. >> Or let's see the problem at a different angle: this is the only place >> which breaks the use of pdev->vpci_lock. Because all other places >> do not try to acquire the lock of any two devices at a time. >> So, what if we re-work the offending piece of code instead? >> That way we do not break parallel access and have the lock per-device >> which might also be a plus. 
>> >> By re-work I mean, that instead of reading already mapped regions >> from tmp we can employ a d->pci_mapped_regions range set which >> will hold all the already mapped ranges. And when it is needed to access >> that range set we use pcidevs_lock which seems to be rare. >> So, modify_bars will rely on pdev->vpci_lock + pcidevs_lock and >> ABBA won't be possible at all. > Sadly that won't replace the usage of the loop in modify_bars. This is > not (exclusively) done in order to prevent mapping the same region > multiple times, but rather to prevent unmapping of regions as long as > there's an enabled BAR that's using it. > > If you wanted to use something like d->pci_mapped_regions it would > have to keep reference counts to regions, in order to know when a > mapping is no longer required by any BAR on the system with memory > decoding enabled. I missed this path, thank you I tried to analyze the locking in pci/vpci. First of all some context to refresh the target we want: the rationale behind moving pdev->vpci->lock outside is to be able dynamically create and destroy pdev->vpci. So, for that reason lock needs to be moved outside of the pdev->vpci. Some of the callers of the vPCI code and locking used: ====================================== vpci_mmio_read/vpci_mmcfg_read ====================================== - vpci_ecam_read - vpci_read !!!!!!!! pdev is acquired, then pdev->vpci_lock is used !!!!!!!! - msix: - control_read - header: - guest_bar_read - msi: - control_read - address_read/address_hi_read - data_read - mask_read ====================================== vpci_mmio_write/vpci_mmcfg_write ====================================== - vpci_ecam_write - vpci_write !!!!!!!! pdev is acquired, then pdev->vpci_lock is used !!!!!!!! 
- msix: - control_write - header: - bar_write/guest_bar_write - cmd_write/guest_cmd_write - rom_write - all write handlers may call modify_bars modify_bars - msi: - control_write - address_write/address_hi_write - data_write - mask_write ====================================== pci_add_device: locked with pcidevs_lock ====================================== - vpci_add_handlers ++++++++ pdev->vpci_lock is used ++++++++ ====================================== pci_remove_device: locked with pcidevs_lock ====================================== - vpci_remove_device ++++++++ pdev->vpci_lock is used ++++++++ - pci_cleanup_msi - free_pdev ====================================== XEN_DOMCTL_assign_device: locked with pcidevs_lock ====================================== - assign_device - vpci_deassign_device - pdev_msix_assign - vpci_assign_device - vpci_add_handlers ++++++++ pdev->vpci_lock is used ++++++++ ====================================== XEN_DOMCTL_deassign_device: locked with pcidevs_lock ====================================== - deassign_device - vpci_deassign_device ++++++++ pdev->vpci_lock is used ++++++++ - vpci_remove_device ====================================== modify_bars is a special case: this is the only function which tries to lock two pci_dev devices: it is done to check for overlaps with other BARs which may have been already mapped or unmapped. So, this is the only case which may deadlock because of pci_dev->vpci_lock. ====================================== Bottom line: ====================================== 1. vpci_{read|write} are not protected with pcidevs_lock and can run in parallel with pci_remove_device which can remove pdev after vpci_{read|write} acquired the pdev pointer. This may lead to a fail due to pdev dereference. So, to protect pdev dereference vpci_{read|write} must also use pdevs_lock. 2. The only offending place which is in the way of pci_dev->vpci_lock is modify_bars. 
If it can be re-worked to track already mapped and unmapped regions then we can avoid having a possible deadlock and can use pci_dev->vpci_lock (rangesets won't help here as we also need refcounting to be implemented). If pcidevs_lock is used for vpci_{read|write} then no deadlock is possible, but modify_bars code must be re-worked not to lock itself (pdev->vpci_lock and tmp->vpci_lock when pdev == tmp, this is minor). 3. We may think about a per-domain rwlock and pdev->vpci_lock, so this solves modify_bars's two pdevs access. But this doesn't solve possible pdev de-reference in vpci_{read|write} vs pci_remove_device. @Roger, @Jan, I would like to hear what you think about the above analysis and how we can proceed with the locking re-work? Thank you in advance, Oleksandr
On 07.02.2022 12:08, Oleksandr Andrushchenko wrote: > 1. vpci_{read|write} are not protected with pcidevs_lock and can run in > parallel with pci_remove_device which can remove pdev after vpci_{read|write} > acquired the pdev pointer. This may lead to a fail due to pdev dereference. > > So, to protect pdev dereference vpci_{read|write} must also use pdevs_lock. I think this is not the only place where there is a theoretical race against pci_remove_device(). I would recommend to separate the overall situation with pcidevs_lock from the issue here. I don't view it as an option to acquire pcidevs_lock in vpci_{read,write}(). If anything, we need proper refcounting of PCI devices (at which point likely a number of lock uses can go away). Jan
On Mon, Feb 07, 2022 at 11:08:39AM +0000, Oleksandr Andrushchenko wrote: > Hello, > > On 04.02.22 16:57, Roger Pau Monné wrote: > > On Fri, Feb 04, 2022 at 02:43:07PM +0000, Oleksandr Andrushchenko wrote: > >> > >> On 04.02.22 15:06, Roger Pau Monné wrote: > >>> On Fri, Feb 04, 2022 at 12:53:20PM +0000, Oleksandr Andrushchenko wrote: > >>>> On 04.02.22 14:47, Jan Beulich wrote: > >>>>> On 04.02.2022 13:37, Oleksandr Andrushchenko wrote: > >>>>>> On 04.02.22 13:37, Jan Beulich wrote: > >>>>>>> On 04.02.2022 12:13, Roger Pau Monné wrote: > >>>>>>>> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote: > >>>>>>>>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: > >>>>>>>>>> On 04.02.22 11:15, Jan Beulich wrote: > >>>>>>>>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: > >>>>>>>>>>>> On 04.02.22 09:52, Jan Beulich wrote: > >>>>>>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: > >>>>>>>>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) > >>>>>>>>>>>>>> continue; > >>>>>>>>>>>>>> } > >>>>>>>>>>>>>> > >>>>>>>>>>>>>> + spin_lock(&tmp->vpci_lock); > >>>>>>>>>>>>>> + if ( !tmp->vpci ) > >>>>>>>>>>>>>> + { > >>>>>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); > >>>>>>>>>>>>>> + continue; > >>>>>>>>>>>>>> + } > >>>>>>>>>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) > >>>>>>>>>>>>>> { > >>>>>>>>>>>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; > >>>>>>>>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) > >>>>>>>>>>>>>> rc = rangeset_remove_range(mem, start, end); > >>>>>>>>>>>>>> if ( rc ) > >>>>>>>>>>>>>> { > >>>>>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); > >>>>>>>>>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", > >>>>>>>>>>>>>> start, end, rc); > >>>>>>>>>>>>>> rangeset_destroy(mem); > >>>>>>>>>>>>>> return rc; > >>>>>>>>>>>>>> } > >>>>>>>>>>>>>> } > >>>>>>>>>>>>>> + 
spin_unlock(&tmp->vpci_lock); > >>>>>>>>>>>>>> } > >>>>>>>>>>>>> At the first glance this simply looks like another unjustified (in the > >>>>>>>>>>>>> description) change, as you're not converting anything here but you > >>>>>>>>>>>>> actually add locking (and I realize this was there before, so I'm sorry > >>>>>>>>>>>>> for not pointing this out earlier). > >>>>>>>>>>>> Well, I thought that the description already has "...the lock can be > >>>>>>>>>>>> used (and in a few cases is used right away) to check whether vpci > >>>>>>>>>>>> is present" and this is enough for such uses as here. > >>>>>>>>>>>>> But then I wonder whether you > >>>>>>>>>>>>> actually tested this, since I can't help getting the impression that > >>>>>>>>>>>>> you're introducing a live-lock: The function is called from cmd_write() > >>>>>>>>>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet that > >>>>>>>>>>>>> function already holds the lock, and the lock is not (currently) > >>>>>>>>>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh > >>>>>>>>>>>>> the locking looks to be entirely unnecessary.) > >>>>>>>>>>>> Well, you are correct: if tmp != pdev then it is correct to acquire > >>>>>>>>>>>> the lock. But if tmp == pdev and rom_only == true > >>>>>>>>>>>> then we'll deadlock. > >>>>>>>>>>>> > >>>>>>>>>>>> It seems we need to have the locking conditional, e.g. only lock > >>>>>>>>>>>> if tmp != pdev > >>>>>>>>>>> Which will address the live-lock, but introduce ABBA deadlock potential > >>>>>>>>>>> between the two locks. > >>>>>>>>>> I am not sure I can suggest a better solution here > >>>>>>>>>> @Roger, @Jan, could you please help here? > >>>>>>>>> Well, first of all I'd like to mention that while it may have been okay to > >>>>>>>>> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing > >>>>>>>>> with DomU-s' lists of PCI devices. 
The requirement really applies to the > >>>>>>>>> other use of for_each_pdev() as well (in vpci_dump_msi()), except that > >>>>>>>>> there it probably wants to be a try-lock. > >>>>>>>>> > >>>>>>>>> Next I'd like to point out that here we have the still pending issue of > >>>>>>>>> how to deal with hidden devices, which Dom0 can access. See my RFC patch > >>>>>>>>> "vPCI: account for hidden devices in modify_bars()". Whatever the solution > >>>>>>>>> here, I think it wants to at least account for the extra need there. > >>>>>>>> Yes, sorry, I should take care of that. > >>>>>>>> > >>>>>>>>> Now it is quite clear that pcidevs_lock isn't going to help with avoiding > >>>>>>>>> the deadlock, as it's imo not an option at all to acquire that lock > >>>>>>>>> everywhere else you access ->vpci (or else the vpci lock itself would be > >>>>>>>>> pointless). But a per-domain auxiliary r/w lock may help: Other paths > >>>>>>>>> would acquire it in read mode, and here you'd acquire it in write mode (in > >>>>>>>>> the former case around the vpci lock, while in the latter case there may > >>>>>>>>> then not be any need to acquire the individual vpci locks at all). FTAOD: > >>>>>>>>> I haven't fully thought through all implications (and hence whether this is > >>>>>>>>> viable in the first place); I expect you will, documenting what you've > >>>>>>>>> found in the resulting patch description. Of course the double lock > >>>>>>>>> acquire/release would then likely want hiding in helper functions. > >>>>>>>> I've been also thinking about this, and whether it's really worth to > >>>>>>>> have a per-device lock rather than a per-domain one that protects all > >>>>>>>> vpci regions of the devices assigned to the domain. 
> >>>>>>>> > >>>>>>>> The OS is likely to serialize accesses to the PCI config space anyway, > >>>>>>>> and the only place I could see a benefit of having per-device locks is > >>>>>>>> in the handling of MSI-X tables, as the handling of the mask bit is > >>>>>>>> likely very performance sensitive, so adding a per-domain lock there > >>>>>>>> could be a bottleneck. > >>>>>>> Hmm, with method 1 accesses serializing globally is basically > >>>>>>> unavoidable, but with MMCFG I see no reason why OSes may not (move > >>>>>>> to) permit(ting) parallel accesses, with serialization perhaps done > >>>>>>> only at device level. See our own pci_config_lock, which applies to > >>>>>>> only method 1 accesses; we don't look to be serializing MMCFG > >>>>>>> accesses at all. > >>>>>>> > >>>>>>>> We could alternatively do a per-domain rwlock for vpci and special case > >>>>>>>> the MSI-X area to also have a per-device specific lock. At which point > >>>>>>>> it becomes fairly similar to what you propose. > >>>>>> @Jan, @Roger > >>>>>> > >>>>>> 1. d->vpci_lock - rwlock <- this protects vpci > >>>>>> 2. pdev->vpci->msix_tbl_lock - rwlock <- this protects MSI-X tables > >>>>>> or should it better be pdev->msix_tbl_lock as MSI-X tables don't > >>>>>> really depend on vPCI? > >>>>> If so, perhaps indeed better the latter. But as said in reply to Roger, > >>>>> I'm not convinced (yet) that doing away with the per-device lock is a > >>>>> good move. As said there - we're ourselves doing fully parallel MMCFG > >>>>> accesses, so OSes ought to be fine to do so, too. > >>>> But with pdev->vpci_lock we face ABBA... > >>> I think it would be easier to start with a per-domain rwlock that > >>> guarantees pdev->vpci cannot be removed under our feet. This would be > >>> taken in read mode in vpci_{read,write} and in write mode when > >>> removing a device from a domain. 
> >>> > >>> Then there are also other issues regarding vPCI locking that need to > >>> be fixed, but that lock would likely be a start. > >> Or let's see the problem at a different angle: this is the only place > >> which breaks the use of pdev->vpci_lock. Because all other places > >> do not try to acquire the lock of any two devices at a time. > >> So, what if we re-work the offending piece of code instead? > >> That way we do not break parallel access and have the lock per-device > >> which might also be a plus. > >> > >> By re-work I mean, that instead of reading already mapped regions > >> from tmp we can employ a d->pci_mapped_regions range set which > >> will hold all the already mapped ranges. And when it is needed to access > >> that range set we use pcidevs_lock which seems to be rare. > >> So, modify_bars will rely on pdev->vpci_lock + pcidevs_lock and > >> ABBA won't be possible at all. > > Sadly that won't replace the usage of the loop in modify_bars. This is > > not (exclusively) done in order to prevent mapping the same region > > multiple times, but rather to prevent unmapping of regions as long as > > there's an enabled BAR that's using it. > > > > If you wanted to use something like d->pci_mapped_regions it would > > have to keep reference counts to regions, in order to know when a > > mapping is no longer required by any BAR on the system with memory > > decoding enabled. > I missed this path, thank you > > I tried to analyze the locking in pci/vpci. > > First of all some context to refresh the target we want: > the rationale behind moving pdev->vpci->lock outside > is to be able dynamically create and destroy pdev->vpci. > So, for that reason lock needs to be moved outside of the pdev->vpci. > > Some of the callers of the vPCI code and locking used: > > ====================================== > vpci_mmio_read/vpci_mmcfg_read > ====================================== > - vpci_ecam_read > - vpci_read > !!!!!!!! 
pdev is acquired, then pdev->vpci_lock is used !!!!!!!! > - msix: > - control_read > - header: > - guest_bar_read > - msi: > - control_read > - address_read/address_hi_read > - data_read > - mask_read > > ====================================== > vpci_mmio_write/vpci_mmcfg_write > ====================================== > - vpci_ecam_write > - vpci_write > !!!!!!!! pdev is acquired, then pdev->vpci_lock is used !!!!!!!! > - msix: > - control_write > - header: > - bar_write/guest_bar_write > - cmd_write/guest_cmd_write > - rom_write > - all write handlers may call modify_bars > modify_bars > - msi: > - control_write > - address_write/address_hi_write > - data_write > - mask_write > > ====================================== > pci_add_device: locked with pcidevs_lock > ====================================== > - vpci_add_handlers > ++++++++ pdev->vpci_lock is used ++++++++ > > ====================================== > pci_remove_device: locked with pcidevs_lock > ====================================== > - vpci_remove_device > ++++++++ pdev->vpci_lock is used ++++++++ > - pci_cleanup_msi > - free_pdev > > ====================================== > XEN_DOMCTL_assign_device: locked with pcidevs_lock > ====================================== > - assign_device > - vpci_deassign_device > - pdev_msix_assign > - vpci_assign_device > - vpci_add_handlers > ++++++++ pdev->vpci_lock is used ++++++++ > > ====================================== > XEN_DOMCTL_deassign_device: locked with pcidevs_lock > ====================================== > - deassign_device > - vpci_deassign_device > ++++++++ pdev->vpci_lock is used ++++++++ > - vpci_remove_device > > > ====================================== > modify_bars is a special case: this is the only function which tries to lock > two pci_dev devices: it is done to check for overlaps with other BARs which may have been > already mapped or unmapped. > > So, this is the only case which may deadlock because of pci_dev->vpci_lock. 
> ====================================== > > Bottom line: > ====================================== > > 1. vpci_{read|write} are not protected with pcidevs_lock and can run in > parallel with pci_remove_device which can remove pdev after vpci_{read|write} > acquired the pdev pointer. This may lead to a fail due to pdev dereference. > > So, to protect pdev dereference vpci_{read|write} must also use pdevs_lock. We would like to take the pcidevs_lock only while fetching the device (ie: pci_get_pdev_by_domain), afterwards it should be fine to lock the device using a vpci specific lock so calls to vpci_{read,write} can be partially concurrent across multiple domains. In fact I think Jan had already pointed out that the pci lock would need taking while searching for the device in vpci_{read,write}. It seems to me that if you implement option 3 below taking the per-domain rwlock in read mode in vpci_{read|write} will already protect you from the device being removed if the same per-domain lock is taken in write mode in vpci_remove_device. > 2. The only offending place which is in the way of pci_dev->vpci_lock is > modify_bars. If it can be re-worked to track already mapped and unmapped > regions then we can avoid having a possible deadlock and can use > pci_dev->vpci_lock (rangesets won't help here as we also need refcounting be > implemented). I think a refcounting based solution will be very complex to implement. I'm however happy to be proven wrong. > If pcidevs_lock is used for vpci_{read|write} then no deadlock is possible, > but modify_bars code must be re-worked not to lock itself (pdev->vpci_lock and > tmp->vpci_lock when pdev == tmp, this is minor). Taking the pcidevs lock (a global lock) is out of the picture IMO, as it's going to serialize all calls of vpci_{read|write}, and would create too much contention on the pcidevs lock. > 3. We may think about a per-domain rwlock and pdev->vpci_lock, so this solves > modify_bars's two pdevs access. 
But this doesn't solve possible pdev > de-reference in vpci_{read|write} vs pci_remove_device. pci_remove_device will call vpci_remove_device, so as long as vpci_remove_device takes the per-domain lock in write (exclusive) mode it should be fine. > @Roger, @Jan, I would like to hear what do you think about the above analysis > and how can we proceed with locking re-work? I think the per-domain rwlock seems like a good option. I would do that as a pre-patch. Thanks, Roger.
On 07.02.22 14:34, Jan Beulich wrote: > On 07.02.2022 12:08, Oleksandr Andrushchenko wrote: >> 1. vpci_{read|write} are not protected with pcidevs_lock and can run in >> parallel with pci_remove_device which can remove pdev after vpci_{read|write} >> acquired the pdev pointer. This may lead to a fail due to pdev dereference. >> >> So, to protect pdev dereference vpci_{read|write} must also use pdevs_lock. > I think this is not the only place where there is a theoretical race > against pci_remove_device(). Not at all, that was just to demonstrate one of the possible sources of races. > I would recommend to separate the > overall situation with pcidevs_lock from the issue here. Do you agree that there is already an issue with that? In the currently existing code? > I don't view > it as an option to acquire pcidevs_lock in vpci_{read,write}(). Yes, that would hurt too much, I agree. But this needs to be solved. > If > anything, we need proper refcounting of PCI devices (at which point > likely a number of lock uses can go away). It seems so. Then not only pdev's need refcounting, but pdev->vpci as well. What's your view on how we can achieve both goals? pdev and pdev->vpci and locking/refcounting This is really crucial for all the code for PCI passthrough on Arm because without this ground work done we can't accept all the patches which rely on this: vPCI changes, MSI/MSI-X etc. > > Jan > Thank you, Oleksandr
On 07.02.2022 13:57, Oleksandr Andrushchenko wrote: > > > On 07.02.22 14:34, Jan Beulich wrote: >> On 07.02.2022 12:08, Oleksandr Andrushchenko wrote: >>> 1. vpci_{read|write} are not protected with pcidevs_lock and can run in >>> parallel with pci_remove_device which can remove pdev after vpci_{read|write} >>> acquired the pdev pointer. This may lead to a fail due to pdev dereference. >>> >>> So, to protect pdev dereference vpci_{read|write} must also use pdevs_lock. >> I think this is not the only place where there is a theoretical race >> against pci_remove_device(). > Not at all, that was just to demonstrate one of the possible sources of races. >> I would recommend to separate the >> overall situation with pcidevs_lock from the issue here. > Do you agree that there is already an issue with that? In the currently existing code? >> I don't view >> it as an option to acquire pcidevs_lock in vpci_{read,write}(). > Yes, that would hurt too much, I agree. But this needs to be solved >> If >> anything, we need proper refcounting of PCI devices (at which point >> likely a number of lock uses can go away). > It seems so. Then not only pdev's need refcounting, but pdev->vpci as well > > What's your view on how can we achieve both goals? > pdev and pdev->vpci and locking/refcounting I don't see why pdev->vpci might need refcounting. And just to state it in different words: I'd like to suggest to leave aside the pdev locking as long as it's _just_ to protect against hot remove of a device. That's orthogonal to what you need for vPCI, where you need to protect against the device disappearing from a guest (without at the same time disappearing from the host). Jan
On 07.02.22 14:46, Roger Pau Monné wrote: > On Mon, Feb 07, 2022 at 11:08:39AM +0000, Oleksandr Andrushchenko wrote: >> Hello, >> >> On 04.02.22 16:57, Roger Pau Monné wrote: >>> On Fri, Feb 04, 2022 at 02:43:07PM +0000, Oleksandr Andrushchenko wrote: >>>> On 04.02.22 15:06, Roger Pau Monné wrote: >>>>> On Fri, Feb 04, 2022 at 12:53:20PM +0000, Oleksandr Andrushchenko wrote: >>>>>> On 04.02.22 14:47, Jan Beulich wrote: >>>>>>> On 04.02.2022 13:37, Oleksandr Andrushchenko wrote: >>>>>>>> On 04.02.22 13:37, Jan Beulich wrote: >>>>>>>>> On 04.02.2022 12:13, Roger Pau Monné wrote: >>>>>>>>>> On Fri, Feb 04, 2022 at 11:49:18AM +0100, Jan Beulich wrote: >>>>>>>>>>> On 04.02.2022 11:12, Oleksandr Andrushchenko wrote: >>>>>>>>>>>> On 04.02.22 11:15, Jan Beulich wrote: >>>>>>>>>>>>> On 04.02.2022 09:58, Oleksandr Andrushchenko wrote: >>>>>>>>>>>>>> On 04.02.22 09:52, Jan Beulich wrote: >>>>>>>>>>>>>>> On 04.02.2022 07:34, Oleksandr Andrushchenko wrote: >>>>>>>>>>>>>>>> @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>>>>>>>>>>> continue; >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>> + spin_lock(&tmp->vpci_lock); >>>>>>>>>>>>>>>> + if ( !tmp->vpci ) >>>>>>>>>>>>>>>> + { >>>>>>>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>>>>>>>> + continue; >>>>>>>>>>>>>>>> + } >>>>>>>>>>>>>>>> for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) >>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>> const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; >>>>>>>>>>>>>>>> @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) >>>>>>>>>>>>>>>> rc = rangeset_remove_range(mem, start, end); >>>>>>>>>>>>>>>> if ( rc ) >>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>>>>>>>> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", >>>>>>>>>>>>>>>> start, end, rc); >>>>>>>>>>>>>>>> rangeset_destroy(mem); >>>>>>>>>>>>>>>> return rc; >>>>>>>>>>>>>>>> } 
>>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>>> + spin_unlock(&tmp->vpci_lock); >>>>>>>>>>>>>>>> } >>>>>>>>>>>>>>> At the first glance this simply looks like another unjustified (in the >>>>>>>>>>>>>>> description) change, as you're not converting anything here but you >>>>>>>>>>>>>>> actually add locking (and I realize this was there before, so I'm sorry >>>>>>>>>>>>>>> for not pointing this out earlier). >>>>>>>>>>>>>> Well, I thought that the description already has "...the lock can be >>>>>>>>>>>>>> used (and in a few cases is used right away) to check whether vpci >>>>>>>>>>>>>> is present" and this is enough for such uses as here. >>>>>>>>>>>>>>> But then I wonder whether you >>>>>>>>>>>>>>> actually tested this, since I can't help getting the impression that >>>>>>>>>>>>>>> you're introducing a live-lock: The function is called from cmd_write() >>>>>>>>>>>>>>> and rom_write(), which in turn are called out of vpci_write(). Yet that >>>>>>>>>>>>>>> function already holds the lock, and the lock is not (currently) >>>>>>>>>>>>>>> recursive. (For the 3rd caller of the function - init_bars() - otoh >>>>>>>>>>>>>>> the locking looks to be entirely unnecessary.) >>>>>>>>>>>>>> Well, you are correct: if tmp != pdev then it is correct to acquire >>>>>>>>>>>>>> the lock. But if tmp == pdev and rom_only == true >>>>>>>>>>>>>> then we'll deadlock. >>>>>>>>>>>>>> >>>>>>>>>>>>>> It seems we need to have the locking conditional, e.g. only lock >>>>>>>>>>>>>> if tmp != pdev >>>>>>>>>>>>> Which will address the live-lock, but introduce ABBA deadlock potential >>>>>>>>>>>>> between the two locks. >>>>>>>>>>>> I am not sure I can suggest a better solution here >>>>>>>>>>>> @Roger, @Jan, could you please help here? >>>>>>>>>>> Well, first of all I'd like to mention that while it may have been okay to >>>>>>>>>>> not hold pcidevs_lock here for Dom0, it surely needs acquiring when dealing >>>>>>>>>>> with DomU-s' lists of PCI devices. 
The requirement really applies to the >>>>>>>>>>> other use of for_each_pdev() as well (in vpci_dump_msi()), except that >>>>>>>>>>> there it probably wants to be a try-lock. >>>>>>>>>>> >>>>>>>>>>> Next I'd like to point out that here we have the still pending issue of >>>>>>>>>>> how to deal with hidden devices, which Dom0 can access. See my RFC patch >>>>>>>>>>> "vPCI: account for hidden devices in modify_bars()". Whatever the solution >>>>>>>>>>> here, I think it wants to at least account for the extra need there. >>>>>>>>>> Yes, sorry, I should take care of that. >>>>>>>>>> >>>>>>>>>>> Now it is quite clear that pcidevs_lock isn't going to help with avoiding >>>>>>>>>>> the deadlock, as it's imo not an option at all to acquire that lock >>>>>>>>>>> everywhere else you access ->vpci (or else the vpci lock itself would be >>>>>>>>>>> pointless). But a per-domain auxiliary r/w lock may help: Other paths >>>>>>>>>>> would acquire it in read mode, and here you'd acquire it in write mode (in >>>>>>>>>>> the former case around the vpci lock, while in the latter case there may >>>>>>>>>>> then not be any need to acquire the individual vpci locks at all). FTAOD: >>>>>>>>>>> I haven't fully thought through all implications (and hence whether this is >>>>>>>>>>> viable in the first place); I expect you will, documenting what you've >>>>>>>>>>> found in the resulting patch description. Of course the double lock >>>>>>>>>>> acquire/release would then likely want hiding in helper functions. >>>>>>>>>> I've been also thinking about this, and whether it's really worth to >>>>>>>>>> have a per-device lock rather than a per-domain one that protects all >>>>>>>>>> vpci regions of the devices assigned to the domain. 
>>>>>>>>>> >>>>>>>>>> The OS is likely to serialize accesses to the PCI config space anyway, >>>>>>>>>> and the only place I could see a benefit of having per-device locks is >>>>>>>>>> in the handling of MSI-X tables, as the handling of the mask bit is >>>>>>>>>> likely very performance sensitive, so adding a per-domain lock there >>>>>>>>>> could be a bottleneck. >>>>>>>>> Hmm, with method 1 accesses serializing globally is basically >>>>>>>>> unavoidable, but with MMCFG I see no reason why OSes may not (move >>>>>>>>> to) permit(ting) parallel accesses, with serialization perhaps done >>>>>>>>> only at device level. See our own pci_config_lock, which applies to >>>>>>>>> only method 1 accesses; we don't look to be serializing MMCFG >>>>>>>>> accesses at all. >>>>>>>>> >>>>>>>>>> We could alternatively do a per-domain rwlock for vpci and special case >>>>>>>>>> the MSI-X area to also have a per-device specific lock. At which point >>>>>>>>>> it becomes fairly similar to what you propose. >>>>>>>> @Jan, @Roger >>>>>>>> >>>>>>>> 1. d->vpci_lock - rwlock <- this protects vpci >>>>>>>> 2. pdev->vpci->msix_tbl_lock - rwlock <- this protects MSI-X tables >>>>>>>> or should it better be pdev->msix_tbl_lock as MSI-X tables don't >>>>>>>> really depend on vPCI? >>>>>>> If so, perhaps indeed better the latter. But as said in reply to Roger, >>>>>>> I'm not convinced (yet) that doing away with the per-device lock is a >>>>>>> good move. As said there - we're ourselves doing fully parallel MMCFG >>>>>>> accesses, so OSes ought to be fine to do so, too. >>>>>> But with pdev->vpci_lock we face ABBA... >>>>> I think it would be easier to start with a per-domain rwlock that >>>>> guarantees pdev->vpci cannot be removed under our feet. This would be >>>>> taken in read mode in vpci_{read,write} and in write mode when >>>>> removing a device from a domain. 
>>>>> >>>>> Then there are also other issues regarding vPCI locking that need to >>>>> be fixed, but that lock would likely be a start. >>>> Or let's see the problem at a different angle: this is the only place >>>> which breaks the use of pdev->vpci_lock. Because all other places >>>> do not try to acquire the lock of any two devices at a time. >>>> So, what if we re-work the offending piece of code instead? >>>> That way we do not break parallel access and have the lock per-device >>>> which might also be a plus. >>>> >>>> By re-work I mean, that instead of reading already mapped regions >>>> from tmp we can employ a d->pci_mapped_regions range set which >>>> will hold all the already mapped ranges. And when it is needed to access >>>> that range set we use pcidevs_lock which seems to be rare. >>>> So, modify_bars will rely on pdev->vpci_lock + pcidevs_lock and >>>> ABBA won't be possible at all. >>> Sadly that won't replace the usage of the loop in modify_bars. This is >>> not (exclusively) done in order to prevent mapping the same region >>> multiple times, but rather to prevent unmapping of regions as long as >>> there's an enabled BAR that's using it. >>> >>> If you wanted to use something like d->pci_mapped_regions it would >>> have to keep reference counts to regions, in order to know when a >>> mapping is no longer required by any BAR on the system with memory >>> decoding enabled. >> I missed this path, thank you >> >> I tried to analyze the locking in pci/vpci. >> >> First of all some context to refresh the target we want: >> the rationale behind moving pdev->vpci->lock outside >> is to be able dynamically create and destroy pdev->vpci. >> So, for that reason lock needs to be moved outside of the pdev->vpci. >> >> Some of the callers of the vPCI code and locking used: >> >> ====================================== >> vpci_mmio_read/vpci_mmcfg_read >> ====================================== >> - vpci_ecam_read >> - vpci_read >> !!!!!!!! 
pdev is acquired, then pdev->vpci_lock is used !!!!!!!! >> - msix: >> - control_read >> - header: >> - guest_bar_read >> - msi: >> - control_read >> - address_read/address_hi_read >> - data_read >> - mask_read >> >> ====================================== >> vpci_mmio_write/vpci_mmcfg_write >> ====================================== >> - vpci_ecam_write >> - vpci_write >> !!!!!!!! pdev is acquired, then pdev->vpci_lock is used !!!!!!!! >> - msix: >> - control_write >> - header: >> - bar_write/guest_bar_write >> - cmd_write/guest_cmd_write >> - rom_write >> - all write handlers may call modify_bars >> modify_bars >> - msi: >> - control_write >> - address_write/address_hi_write >> - data_write >> - mask_write >> >> ====================================== >> pci_add_device: locked with pcidevs_lock >> ====================================== >> - vpci_add_handlers >> ++++++++ pdev->vpci_lock is used ++++++++ >> >> ====================================== >> pci_remove_device: locked with pcidevs_lock >> ====================================== >> - vpci_remove_device >> ++++++++ pdev->vpci_lock is used ++++++++ >> - pci_cleanup_msi >> - free_pdev >> >> ====================================== >> XEN_DOMCTL_assign_device: locked with pcidevs_lock >> ====================================== >> - assign_device >> - vpci_deassign_device >> - pdev_msix_assign >> - vpci_assign_device >> - vpci_add_handlers >> ++++++++ pdev->vpci_lock is used ++++++++ >> >> ====================================== >> XEN_DOMCTL_deassign_device: locked with pcidevs_lock >> ====================================== >> - deassign_device >> - vpci_deassign_device >> ++++++++ pdev->vpci_lock is used ++++++++ >> - vpci_remove_device >> >> >> ====================================== >> modify_bars is a special case: this is the only function which tries to lock >> two pci_dev devices: it is done to check for overlaps with other BARs which may have been >> already mapped or unmapped. 
>> >> So, this is the only case which may deadlock because of pci_dev->vpci_lock. >> ====================================== >> >> Bottom line: >> ====================================== >> >> 1. vpci_{read|write} are not protected with pcidevs_lock and can run in >> parallel with pci_remove_device which can remove pdev after vpci_{read|write} >> acquired the pdev pointer. This may lead to a fail due to pdev dereference. >> >> So, to protect pdev dereference vpci_{read|write} must also use pdevs_lock. > We would like to take the pcidevs_lock only while fetching the device > (ie: pci_get_pdev_by_domain), afterwards it should be fine to lock the > device using a vpci specific lock so calls to vpci_{read,write} can be > partially concurrent across multiple domains. This means this can't be done a pre-req patch, but as a part of the patch which changes locking. > > In fact I think Jan had already pointed out that the pci lock would > need taking while searching for the device in vpci_{read,write}. I was referring to the time after we found pdev and it is currently possible to free pdev while using it after the search > > It seems to me that if you implement option 3 below taking the > per-domain rwlock in read mode in vpci_{read|write} will already > protect you from the device being removed if the same per-domain lock > is taken in write mode in vpci_remove_device. Yes, it should. Again this can't be done as a pre-req patch because this relies on pdev->vpci_lock > >> 2. The only offending place which is in the way of pci_dev->vpci_lock is >> modify_bars. If it can be re-worked to track already mapped and unmapped >> regions then we can avoid having a possible deadlock and can use >> pci_dev->vpci_lock (rangesets won't help here as we also need refcounting be >> implemented). > I think a refcounting based solution will be very complex to > implement. I'm however happy to be proven wrong. 
I can't estimate, but I have a feeling that all these plays around locking is just because of this single piece of code. No other place suffer from pdev->vpci_lock and no d->lock > >> If pcidevs_lock is used for vpci_{read|write} then no deadlock is possible, >> but modify_bars code must be re-worked not to lock itself (pdev->vpci_lock and >> tmp->vpci_lock when pdev == tmp, this is minor). > Taking the pcidevs lock (a global lock) is out of the picture IMO, as > it's going to serialize all calls of vpci_{read|write}, and would > create too much contention on the pcidevs lock. I understand that. But if we would like to fix the existing code I see no other alternative. > >> 3. We may think about a per-domain rwlock and pdev->vpci_lock, so this solves >> modify_bars's two pdevs access. But this doesn't solve possible pdev >> de-reference in vpci_{read|write} vs pci_remove_device. > pci_remove device will call vpci_remove_device, so as long as > vpci_remove_device taken the per-domain lock in write (exclusive) mode > it should be fine. I think I need to see if there are any other places which similarly require the write lock > >> @Roger, @Jan, I would like to hear what do you think about the above analysis >> and how can we proceed with locking re-work? > I think the per-domain rwlock seems like a good option. I would do > that as a pre-patch. It is. But it seems it won't solve the thing we started this adventure for: With per-domain read lock and still ABBA in modify_bars (hope the below is correctly seen with a monospace font): cpu0: vpci_write-> d->RLock -> pdev1->lock -> rom_write -> modify_bars: tmp (pdev2) ->lock cpu1: vpci_write-> d->RLock pdev2->lock -> cmd_write -> modify_bars: tmp (pdev1) ->lock There is no API to upgrade read lock to write lock in modify_bars which could help, so in both cases vpci_write should take write lock. Am I missing something here? > > Thanks, Roger. Thank you, Oleksandr
On 07.02.2022 14:53, Oleksandr Andrushchenko wrote: > On 07.02.22 14:46, Roger Pau Monné wrote: >> I think the per-domain rwlock seems like a good option. I would do >> that as a pre-patch. > It is. But it seems it won't solve the thing we started this adventure for: > > With per-domain read lock and still ABBA in modify_bars (hope the below > is correctly seen with a monospace font): > > cpu0: vpci_write-> d->RLock -> pdev1->lock -> rom_write -> modify_bars: tmp (pdev2) ->lock > cpu1: vpci_write-> d->RLock pdev2->lock -> cmd_write -> modify_bars: tmp (pdev1) ->lock > > There is no API to upgrade read lock to write lock in modify_bars which could help, > so in both cases vpci_write should take write lock. Hmm, yes, I think you're right: It's not modify_bars() itself which needs to acquire the write lock, but its (perhaps indirect) caller. Effectively vpci_write() would need to take the write lock if the range written overlaps the BARs or the command register. Jan
On Mon, Feb 07, 2022 at 01:53:34PM +0000, Oleksandr Andrushchenko wrote: > > > On 07.02.22 14:46, Roger Pau Monné wrote: > > On Mon, Feb 07, 2022 at 11:08:39AM +0000, Oleksandr Andrushchenko wrote: > >> ====================================== > >> > >> Bottom line: > >> ====================================== > >> > >> 1. vpci_{read|write} are not protected with pcidevs_lock and can run in > >> parallel with pci_remove_device which can remove pdev after vpci_{read|write} > >> acquired the pdev pointer. This may lead to a fail due to pdev dereference. > >> > >> So, to protect pdev dereference vpci_{read|write} must also use pdevs_lock. > > We would like to take the pcidevs_lock only while fetching the device > > (ie: pci_get_pdev_by_domain), afterwards it should be fine to lock the > > device using a vpci specific lock so calls to vpci_{read,write} can be > > partially concurrent across multiple domains. > This means this can't be done a pre-req patch, but as a part of the > patch which changes locking. > > > > In fact I think Jan had already pointed out that the pci lock would > > need taking while searching for the device in vpci_{read,write}. > I was referring to the time after we found pdev and it is currently > possible to free pdev while using it after the search > > > > It seems to me that if you implement option 3 below taking the > > per-domain rwlock in read mode in vpci_{read|write} will already > > protect you from the device being removed if the same per-domain lock > > is taken in write mode in vpci_remove_device. > Yes, it should. Again this can't be done as a pre-req patch because > this relies on pdev->vpci_lock Hm, no, I don't think so. You could introduce this per-domain rwlock in a prepatch, and then move the vpci lock outside of the vpci struct. I see no problem with that. > > > >> 2. The only offending place which is in the way of pci_dev->vpci_lock is > >> modify_bars. 
If it can be re-worked to track already mapped and unmapped > >> regions then we can avoid having a possible deadlock and can use > >> pci_dev->vpci_lock (rangesets won't help here as we also need refcounting be > >> implemented). > > I think a refcounting based solution will be very complex to > > implement. I'm however happy to be proven wrong. > I can't estimate, but I have a feeling that all these plays around locking > is just because of this single piece of code. No other place suffer from > pdev->vpci_lock and no d->lock > > > >> If pcidevs_lock is used for vpci_{read|write} then no deadlock is possible, > >> but modify_bars code must be re-worked not to lock itself (pdev->vpci_lock and > >> tmp->vpci_lock when pdev == tmp, this is minor). > > Taking the pcidevs lock (a global lock) is out of the picture IMO, as > > it's going to serialize all calls of vpci_{read|write}, and would > > create too much contention on the pcidevs lock. > I understand that. But if we would like to fix the existing code I see > no other alternative. > > > >> 3. We may think about a per-domain rwlock and pdev->vpci_lock, so this solves > >> modify_bars's two pdevs access. But this doesn't solve possible pdev > >> de-reference in vpci_{read|write} vs pci_remove_device. > > pci_remove device will call vpci_remove_device, so as long as > > vpci_remove_device taken the per-domain lock in write (exclusive) mode > > it should be fine. > I think I need to see if there are any other places which similarly > require the write lock > > > >> @Roger, @Jan, I would like to hear what do you think about the above analysis > >> and how can we proceed with locking re-work? > > I think the per-domain rwlock seems like a good option. I would do > > that as a pre-patch. > It is. 
But it seems it won't solve the thing we started this adventure for: > > With per-domain read lock and still ABBA in modify_bars (hope the below > is correctly seen with a monospace font): > > cpu0: vpci_write-> d->RLock -> pdev1->lock -> rom_write -> modify_bars: tmp (pdev2) ->lock > cpu1: vpci_write-> d->RLock pdev2->lock -> cmd_write -> modify_bars: tmp (pdev1) ->lock > > There is no API to upgrade read lock to write lock in modify_bars which could help, > so in both cases vpci_write should take write lock. I've thought more than once that it would be nice to have a write_{upgrade,downgrade} (read_downgrade maybe?) or similar helper. I think you could also drop the read lock, take the write lock and check that &pdev->vpci->header == header in order to be sure pdev->vpci hasn't been recreated. You would have to do similar in order to get back again from a write lock into a read one. We should avoid taking the rwlock in write mode in vpci_write unconditionally. Thanks, Roger.
On Mon, Feb 07, 2022 at 03:11:03PM +0100, Jan Beulich wrote: > On 07.02.2022 14:53, Oleksandr Andrushchenko wrote: > > On 07.02.22 14:46, Roger Pau Monné wrote: > >> I think the per-domain rwlock seems like a good option. I would do > >> that as a pre-patch. > > It is. But it seems it won't solve the thing we started this adventure for: > > > > With per-domain read lock and still ABBA in modify_bars (hope the below > > is correctly seen with a monospace font): > > > > cpu0: vpci_write-> d->RLock -> pdev1->lock -> rom_write -> modify_bars: tmp (pdev2) ->lock > > cpu1: vpci_write-> d->RLock pdev2->lock -> cmd_write -> modify_bars: tmp (pdev1) ->lock > > > > There is no API to upgrade read lock to write lock in modify_bars which could help, > > so in both cases vpci_write should take write lock. > > Hmm, yes, I think you're right: It's not modify_bars() itself which needs > to acquire the write lock, but its (perhaps indirect) caller. Effectively > vpci_write() would need to take the write lock if the range written > overlaps the BARs or the command register. I'm confused. If we use a per-domain rwlock approach there would be no need to lock tmp again in modify_bars, because we should hold the rwlock in write mode, so there's no ABBA? We will have however to drop the per domain read and vpci locks and pick the per-domain lock in write mode. Thanks, Roger.
On 07.02.22 16:19, Roger Pau Monné wrote: > On Mon, Feb 07, 2022 at 01:53:34PM +0000, Oleksandr Andrushchenko wrote: >> >> On 07.02.22 14:46, Roger Pau Monné wrote: >>> On Mon, Feb 07, 2022 at 11:08:39AM +0000, Oleksandr Andrushchenko wrote: >>>> ====================================== >>>> >>>> Bottom line: >>>> ====================================== >>>> >>>> 1. vpci_{read|write} are not protected with pcidevs_lock and can run in >>>> parallel with pci_remove_device which can remove pdev after vpci_{read|write} >>>> acquired the pdev pointer. This may lead to a fail due to pdev dereference. >>>> >>>> So, to protect pdev dereference vpci_{read|write} must also use pdevs_lock. >>> We would like to take the pcidevs_lock only while fetching the device >>> (ie: pci_get_pdev_by_domain), afterwards it should be fine to lock the >>> device using a vpci specific lock so calls to vpci_{read,write} can be >>> partially concurrent across multiple domains. >> This means this can't be done a pre-req patch, but as a part of the >> patch which changes locking. >>> In fact I think Jan had already pointed out that the pci lock would >>> need taking while searching for the device in vpci_{read,write}. >> I was referring to the time after we found pdev and it is currently >> possible to free pdev while using it after the search >>> It seems to me that if you implement option 3 below taking the >>> per-domain rwlock in read mode in vpci_{read|write} will already >>> protect you from the device being removed if the same per-domain lock >>> is taken in write mode in vpci_remove_device. >> Yes, it should. Again this can't be done as a pre-req patch because >> this relies on pdev->vpci_lock > Hm, no, I don't think so. You could introduce this per-domain rwlock > in a prepatch, and then move the vpci lock outside of the vpci struct. > I see no problem with that. > >>>> 2. The only offending place which is in the way of pci_dev->vpci_lock is >>>> modify_bars. 
If it can be re-worked to track already mapped and unmapped >>>> regions then we can avoid having a possible deadlock and can use >>>> pci_dev->vpci_lock (rangesets won't help here as we also need refcounting be >>>> implemented). >>> I think a refcounting based solution will be very complex to >>> implement. I'm however happy to be proven wrong. >> I can't estimate, but I have a feeling that all these plays around locking >> is just because of this single piece of code. No other place suffer from >> pdev->vpci_lock and no d->lock >>>> If pcidevs_lock is used for vpci_{read|write} then no deadlock is possible, >>>> but modify_bars code must be re-worked not to lock itself (pdev->vpci_lock and >>>> tmp->vpci_lock when pdev == tmp, this is minor). >>> Taking the pcidevs lock (a global lock) is out of the picture IMO, as >>> it's going to serialize all calls of vpci_{read|write}, and would >>> create too much contention on the pcidevs lock. >> I understand that. But if we would like to fix the existing code I see >> no other alternative. >>>> 3. We may think about a per-domain rwlock and pdev->vpci_lock, so this solves >>>> modify_bars's two pdevs access. But this doesn't solve possible pdev >>>> de-reference in vpci_{read|write} vs pci_remove_device. >>> pci_remove device will call vpci_remove_device, so as long as >>> vpci_remove_device taken the per-domain lock in write (exclusive) mode >>> it should be fine. >> I think I need to see if there are any other places which similarly >> require the write lock >>>> @Roger, @Jan, I would like to hear what do you think about the above analysis >>>> and how can we proceed with locking re-work? >>> I think the per-domain rwlock seems like a good option. I would do >>> that as a pre-patch. >> It is. 
But it seems it won't solve the thing we started this adventure for: >> >> With per-domain read lock and still ABBA in modify_bars (hope the below >> is correctly seen with a monospace font): >> >> cpu0: vpci_write-> d->RLock -> pdev1->lock -> rom_write -> modify_bars: tmp (pdev2) ->lock >> cpu1: vpci_write-> d->RLock pdev2->lock -> cmd_write -> modify_bars: tmp (pdev1) ->lock >> >> There is no API to upgrade read lock to write lock in modify_bars which could help, >> so in both cases vpci_write should take write lock. > I've thought more than once that it would be nice to have a > write_{upgrade,downgrade} (read_downgrade maybe?) or similar helper. Yes, this is the real use-case for that > > I think you could also drop the read lock, take the write lock and > check that &pdev->vpci->header == header in order to be sure > pdev->vpci hasn't been recreated. And have pdev freed in between.... > You would have to do similar in > order to get back again from a write lock into a read one. Not sure this is reliable. > > We should avoid taking the rwlock in write mode in vpci_write > unconditionally. Yes, but without upgrading the read lock I see no way it can be done > > Thanks, Roger. Thank you, Oleksandr
On 07.02.22 16:11, Jan Beulich wrote: > On 07.02.2022 14:53, Oleksandr Andrushchenko wrote: >> On 07.02.22 14:46, Roger Pau Monné wrote: >>> I think the per-domain rwlock seems like a good option. I would do >>> that as a pre-patch. >> It is. But it seems it won't solve the thing we started this adventure for: >> >> With per-domain read lock and still ABBA in modify_bars (hope the below >> is correctly seen with a monospace font): >> >> cpu0: vpci_write-> d->RLock -> pdev1->lock -> rom_write -> modify_bars: tmp (pdev2) ->lock >> cpu1: vpci_write-> d->RLock pdev2->lock -> cmd_write -> modify_bars: tmp (pdev1) ->lock >> >> There is no API to upgrade read lock to write lock in modify_bars which could help, >> so in both cases vpci_write should take write lock. > Hmm, yes, I think you're right: It's not modify_bars() itself which needs > to acquire the write lock, but its (perhaps indirect) caller. Effectively > vpci_write() would need to take the write lock if the range written > overlaps the BARs or the command register. Exactly, vpci_write needs a write lock, but it is not desirable. And again, there is a single offending piece of code which wants that... > Jan > Thank you, Oleksandr
On 07.02.2022 15:27, Roger Pau Monné wrote: > On Mon, Feb 07, 2022 at 03:11:03PM +0100, Jan Beulich wrote: >> On 07.02.2022 14:53, Oleksandr Andrushchenko wrote: >>> On 07.02.22 14:46, Roger Pau Monné wrote: >>>> I think the per-domain rwlock seems like a good option. I would do >>>> that as a pre-patch. >>> It is. But it seems it won't solve the thing we started this adventure for: >>> >>> With per-domain read lock and still ABBA in modify_bars (hope the below >>> is correctly seen with a monospace font): >>> >>> cpu0: vpci_write-> d->RLock -> pdev1->lock -> rom_write -> modify_bars: tmp (pdev2) ->lock >>> cpu1: vpci_write-> d->RLock pdev2->lock -> cmd_write -> modify_bars: tmp (pdev1) ->lock >>> >>> There is no API to upgrade read lock to write lock in modify_bars which could help, >>> so in both cases vpci_write should take write lock. >> >> Hmm, yes, I think you're right: It's not modify_bars() itself which needs >> to acquire the write lock, but its (perhaps indirect) caller. Effectively >> vpci_write() would need to take the write lock if the range written >> overlaps the BARs or the command register. > > I'm confused. If we use a per-domain rwlock approach there would be no > need to lock tmp again in modify_bars, because we should hold the > rwlock in write mode, so there's no ABBA? > > We will have however to drop the per domain read and vpci locks and > pick the per-domain lock in write mode. Well, yes, with intermediate dropping of the lock acquiring in write mode can be done in modify_bars(). I'm not convinced (yet) that such intermediate dropping is actually going to be okay. Jan
On 07.02.22 16:27, Roger Pau Monné wrote: > On Mon, Feb 07, 2022 at 03:11:03PM +0100, Jan Beulich wrote: >> On 07.02.2022 14:53, Oleksandr Andrushchenko wrote: >>> On 07.02.22 14:46, Roger Pau Monné wrote: >>>> I think the per-domain rwlock seems like a good option. I would do >>>> that as a pre-patch. >>> It is. But it seems it won't solve the thing we started this adventure for: >>> >>> With per-domain read lock and still ABBA in modify_bars (hope the below >>> is correctly seen with a monospace font): >>> >>> cpu0: vpci_write-> d->RLock -> pdev1->lock -> rom_write -> modify_bars: tmp (pdev2) ->lock >>> cpu1: vpci_write-> d->RLock pdev2->lock -> cmd_write -> modify_bars: tmp (pdev1) ->lock >>> >>> There is no API to upgrade read lock to write lock in modify_bars which could help, >>> so in both cases vpci_write should take write lock. >> Hmm, yes, I think you're right: It's not modify_bars() itself which needs >> to acquire the write lock, but its (perhaps indirect) caller. Effectively >> vpci_write() would need to take the write lock if the range written >> overlaps the BARs or the command register. > I'm confused. If we use a per-domain rwlock approach there would be no > need to lock tmp again in modify_bars, because we should hold the > rwlock in write mode, so there's no ABBA? this is only possible with what you wrote below: > > We will have however to drop the per domain read and vpci locks and > pick the per-domain lock in write mode. I think this is going to be unreliable. We need a reliable way to upgrade read lock to write lock. Then, we can drop pdev->vpci_lock at all, because we are always protected with d->rwlock and those who want to free pdev->vpci will use write lock. So, per-domain rwlock with write upgrade implemented minus pdev->vpci should do the trick > Thanks, Roger. Thank you, Oleksandr
On 07.02.22 16:35, Oleksandr Andrushchenko wrote: > > On 07.02.22 16:27, Roger Pau Monné wrote: >> On Mon, Feb 07, 2022 at 03:11:03PM +0100, Jan Beulich wrote: >>> On 07.02.2022 14:53, Oleksandr Andrushchenko wrote: >>>> On 07.02.22 14:46, Roger Pau Monné wrote: >>>>> I think the per-domain rwlock seems like a good option. I would do >>>>> that as a pre-patch. >>>> It is. But it seems it won't solve the thing we started this adventure for: >>>> >>>> With per-domain read lock and still ABBA in modify_bars (hope the below >>>> is correctly seen with a monospace font): >>>> >>>> cpu0: vpci_write-> d->RLock -> pdev1->lock -> rom_write -> modify_bars: tmp (pdev2) ->lock >>>> cpu1: vpci_write-> d->RLock pdev2->lock -> cmd_write -> modify_bars: tmp (pdev1) ->lock >>>> >>>> There is no API to upgrade read lock to write lock in modify_bars which could help, >>>> so in both cases vpci_write should take write lock. >>> Hmm, yes, I think you're right: It's not modify_bars() itself which needs >>> to acquire the write lock, but its (perhaps indirect) caller. Effectively >>> vpci_write() would need to take the write lock if the range written >>> overlaps the BARs or the command register. >> I'm confused. If we use a per-domain rwlock approach there would be no >> need to lock tmp again in modify_bars, because we should hold the >> rwlock in write mode, so there's no ABBA? > this is only possible with what you wrote below: >> We will have however to drop the per domain read and vpci locks and >> pick the per-domain lock in write mode. > I think this is going to be unreliable. We need a reliable way to > upgrade read lock to write lock. > Then, we can drop pdev->vpci_lock at all, because we are always > protected with d->rwlock and those who want to free pdev->vpci > will use write lock. 
> > So, per-domain rwlock with write upgrade implemented minus pdev->vpci > should do the trick Linux doesn't implement write upgrade and it seems for a reason [1]: "Also, you cannot “upgrade” a read-lock to a write-lock, so if you at _any_ time need to do any changes (even if you don’t do it every time), you have to get the write-lock at the very beginning." So, I am not sure we can have the same for Xen... At the moment I see at least three possible ways to solve the issue: 1. Make vpci_write use write lock, thus make all write accesses synchronized for the given domain, reads are fully parallel 2. Re-implement pdev/tmp overlapping detection with something which won't require pdev->vpci_lock/tmp->vpci_lock 3. Drop read and acquire write lock in modify_bars... but this is not reliable and will hide a free(pdev->vpci) bug @Roger, @Jan: Any other suggestions? Thank you, Oleksandr [1] https://www.kernel.org/doc/html/latest/locking/spinlocks.html#lesson-2-reader-writer-spinlocks
On 07.02.2022 16:11, Oleksandr Andrushchenko wrote: > > > On 07.02.22 16:35, Oleksandr Andrushchenko wrote: >> >> On 07.02.22 16:27, Roger Pau Monné wrote: >>> On Mon, Feb 07, 2022 at 03:11:03PM +0100, Jan Beulich wrote: >>>> On 07.02.2022 14:53, Oleksandr Andrushchenko wrote: >>>>> On 07.02.22 14:46, Roger Pau Monné wrote: >>>>>> I think the per-domain rwlock seems like a good option. I would do >>>>>> that as a pre-patch. >>>>> It is. But it seems it won't solve the thing we started this adventure for: >>>>> >>>>> With per-domain read lock and still ABBA in modify_bars (hope the below >>>>> is correctly seen with a monospace font): >>>>> >>>>> cpu0: vpci_write-> d->RLock -> pdev1->lock -> rom_write -> modify_bars: tmp (pdev2) ->lock >>>>> cpu1: vpci_write-> d->RLock pdev2->lock -> cmd_write -> modify_bars: tmp (pdev1) ->lock >>>>> >>>>> There is no API to upgrade read lock to write lock in modify_bars which could help, >>>>> so in both cases vpci_write should take write lock. >>>> Hmm, yes, I think you're right: It's not modify_bars() itself which needs >>>> to acquire the write lock, but its (perhaps indirect) caller. Effectively >>>> vpci_write() would need to take the write lock if the range written >>>> overlaps the BARs or the command register. >>> I'm confused. If we use a per-domain rwlock approach there would be no >>> need to lock tmp again in modify_bars, because we should hold the >>> rwlock in write mode, so there's no ABBA? >> this is only possible with what you wrote below: >>> We will have however to drop the per domain read and vpci locks and >>> pick the per-domain lock in write mode. >> I think this is going to be unreliable. We need a reliable way to >> upgrade read lock to write lock. >> Then, we can drop pdev->vpci_lock at all, because we are always >> protected with d->rwlock and those who want to free pdev->vpci >> will use write lock. 
>> >> So, per-domain rwlock with write upgrade implemented minus pdev->vpci >> should do the trick > Linux doesn't implement write upgrade and it seems for a reason [1]: > "Also, you cannot “upgrade” a read-lock to a write-lock, so if you at _any_ time > need to do any changes (even if you don’t do it every time), you have to get > the write-lock at the very beginning." > > So, I am not sure we can have the same for Xen... > > At the moment I see at least two possible ways to solve the issue: > 1. Make vpci_write use write lock, thus make all write accesses synchronized > for the given domain, read are fully parallel 1b. Make vpci_write use write lock for writes to command register and BARs only; keep using the read lock for all other writes. Jan > 2. Re-implement pdev/tmp overlapping detection with something which won't > require pdev->vpci_lock/tmp->vpci_lock > > 3. Drop read and acquire write lock in modify_bars... but this is not reliable > and will hide a free(pdev->vpci) bug > > @Roger, @Jan: Any other suggestions? > > Thank you, > Oleksandr > > [1] https://www.kernel.org/doc/html/latest/locking/spinlocks.html#lesson-2-reader-writer-spinlocks
On 07.02.22 17:26, Jan Beulich wrote: > On 07.02.2022 16:11, Oleksandr Andrushchenko wrote: >> >> On 07.02.22 16:35, Oleksandr Andrushchenko wrote: >>> On 07.02.22 16:27, Roger Pau Monné wrote: >>>> On Mon, Feb 07, 2022 at 03:11:03PM +0100, Jan Beulich wrote: >>>>> On 07.02.2022 14:53, Oleksandr Andrushchenko wrote: >>>>>> On 07.02.22 14:46, Roger Pau Monné wrote: >>>>>>> I think the per-domain rwlock seems like a good option. I would do >>>>>>> that as a pre-patch. >>>>>> It is. But it seems it won't solve the thing we started this adventure for: >>>>>> >>>>>> With per-domain read lock and still ABBA in modify_bars (hope the below >>>>>> is correctly seen with a monospace font): >>>>>> >>>>>> cpu0: vpci_write-> d->RLock -> pdev1->lock -> rom_write -> modify_bars: tmp (pdev2) ->lock >>>>>> cpu1: vpci_write-> d->RLock pdev2->lock -> cmd_write -> modify_bars: tmp (pdev1) ->lock >>>>>> >>>>>> There is no API to upgrade read lock to write lock in modify_bars which could help, >>>>>> so in both cases vpci_write should take write lock. >>>>> Hmm, yes, I think you're right: It's not modify_bars() itself which needs >>>>> to acquire the write lock, but its (perhaps indirect) caller. Effectively >>>>> vpci_write() would need to take the write lock if the range written >>>>> overlaps the BARs or the command register. >>>> I'm confused. If we use a per-domain rwlock approach there would be no >>>> need to lock tmp again in modify_bars, because we should hold the >>>> rwlock in write mode, so there's no ABBA? >>> this is only possible with what you wrote below: >>>> We will have however to drop the per domain read and vpci locks and >>>> pick the per-domain lock in write mode. >>> I think this is going to be unreliable. We need a reliable way to >>> upgrade read lock to write lock. >>> Then, we can drop pdev->vpci_lock at all, because we are always >>> protected with d->rwlock and those who want to free pdev->vpci >>> will use write lock. 
>>> >>> So, per-domain rwlock with write upgrade implemented minus pdev->vpci >>> should do the trick >> Linux doesn't implement write upgrade and it seems for a reason [1]: >> "Also, you cannot “upgrade” a read-lock to a write-lock, so if you at _any_ time >> need to do any changes (even if you don’t do it every time), you have to get >> the write-lock at the very beginning." >> >> So, I am not sure we can have the same for Xen... >> >> At the moment I see at least two possible ways to solve the issue: >> 1. Make vpci_write use write lock, thus make all write accesses synchronized >> for the given domain, read are fully parallel > 1b. Make vpci_write use write lock for writes to command register and BARs > only; keep using the read lock for all other writes. I am not quite sure how to do that. Do you mean something like: void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, uint32_t data) [snip] list_for_each_entry ( r, &pdev->vpci->handlers, node ) { [snip] if ( r->needs_write_lock) write_lock(d->vpci_lock) else read_lock(d->vpci_lock) .... And provide rw as an argument to: int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, vpci_write_t *write_handler, unsigned int offset, unsigned int size, void *data, --->>> bool write_path <<<-----) Is this what you mean? With the above, if we have d->vpci_lock, I think we can drop pdev->vpci_lock at all Thank you, Oleksandr P.S. I don't think you mean we just drop the read lock and acquire write lock as it leads to the mentioned before unreliability.
On Mon, Feb 07, 2022 at 04:26:56PM +0100, Jan Beulich wrote: > On 07.02.2022 16:11, Oleksandr Andrushchenko wrote: > > > > > > On 07.02.22 16:35, Oleksandr Andrushchenko wrote: > >> > >> On 07.02.22 16:27, Roger Pau Monné wrote: > >>> On Mon, Feb 07, 2022 at 03:11:03PM +0100, Jan Beulich wrote: > >>>> On 07.02.2022 14:53, Oleksandr Andrushchenko wrote: > >>>>> On 07.02.22 14:46, Roger Pau Monné wrote: > >>>>>> I think the per-domain rwlock seems like a good option. I would do > >>>>>> that as a pre-patch. > >>>>> It is. But it seems it won't solve the thing we started this adventure for: > >>>>> > >>>>> With per-domain read lock and still ABBA in modify_bars (hope the below > >>>>> is correctly seen with a monospace font): > >>>>> > >>>>> cpu0: vpci_write-> d->RLock -> pdev1->lock -> rom_write -> modify_bars: tmp (pdev2) ->lock > >>>>> cpu1: vpci_write-> d->RLock pdev2->lock -> cmd_write -> modify_bars: tmp (pdev1) ->lock > >>>>> > >>>>> There is no API to upgrade read lock to write lock in modify_bars which could help, > >>>>> so in both cases vpci_write should take write lock. > >>>> Hmm, yes, I think you're right: It's not modify_bars() itself which needs > >>>> to acquire the write lock, but its (perhaps indirect) caller. Effectively > >>>> vpci_write() would need to take the write lock if the range written > >>>> overlaps the BARs or the command register. > >>> I'm confused. If we use a per-domain rwlock approach there would be no > >>> need to lock tmp again in modify_bars, because we should hold the > >>> rwlock in write mode, so there's no ABBA? > >> this is only possible with what you wrote below: > >>> We will have however to drop the per domain read and vpci locks and > >>> pick the per-domain lock in write mode. > >> I think this is going to be unreliable. We need a reliable way to > >> upgrade read lock to write lock. 
> >> Then, we can drop pdev->vpci_lock at all, because we are always > >> protected with d->rwlock and those who want to free pdev->vpci > >> will use write lock. > >> > >> So, per-domain rwlock with write upgrade implemented minus pdev->vpci > >> should do the trick > > Linux doesn't implement write upgrade and it seems for a reason [1]: > > "Also, you cannot “upgrade” a read-lock to a write-lock, so if you at _any_ time > > need to do any changes (even if you don’t do it every time), you have to get > > the write-lock at the very beginning." > > > > So, I am not sure we can have the same for Xen... > > > > At the moment I see at least two possible ways to solve the issue: > > 1. Make vpci_write use write lock, thus make all write accesses synchronized > > for the given domain, read are fully parallel > > 1b. Make vpci_write use write lock for writes to command register and BARs > only; keep using the read lock for all other writes. We do not support writing to the BARs with memory decoding enabled currently for dom0, so we would only need to pick the lock in write mode for the command register and ROM BAR write handler AFAICT. Thanks, Roger.
On 07.02.2022 17:08, Roger Pau Monné wrote: > On Mon, Feb 07, 2022 at 04:26:56PM +0100, Jan Beulich wrote: >> On 07.02.2022 16:11, Oleksandr Andrushchenko wrote: >>> >>> >>> On 07.02.22 16:35, Oleksandr Andrushchenko wrote: >>>> >>>> On 07.02.22 16:27, Roger Pau Monné wrote: >>>>> On Mon, Feb 07, 2022 at 03:11:03PM +0100, Jan Beulich wrote: >>>>>> On 07.02.2022 14:53, Oleksandr Andrushchenko wrote: >>>>>>> On 07.02.22 14:46, Roger Pau Monné wrote: >>>>>>>> I think the per-domain rwlock seems like a good option. I would do >>>>>>>> that as a pre-patch. >>>>>>> It is. But it seems it won't solve the thing we started this adventure for: >>>>>>> >>>>>>> With per-domain read lock and still ABBA in modify_bars (hope the below >>>>>>> is correctly seen with a monospace font): >>>>>>> >>>>>>> cpu0: vpci_write-> d->RLock -> pdev1->lock -> rom_write -> modify_bars: tmp (pdev2) ->lock >>>>>>> cpu1: vpci_write-> d->RLock pdev2->lock -> cmd_write -> modify_bars: tmp (pdev1) ->lock >>>>>>> >>>>>>> There is no API to upgrade read lock to write lock in modify_bars which could help, >>>>>>> so in both cases vpci_write should take write lock. >>>>>> Hmm, yes, I think you're right: It's not modify_bars() itself which needs >>>>>> to acquire the write lock, but its (perhaps indirect) caller. Effectively >>>>>> vpci_write() would need to take the write lock if the range written >>>>>> overlaps the BARs or the command register. >>>>> I'm confused. If we use a per-domain rwlock approach there would be no >>>>> need to lock tmp again in modify_bars, because we should hold the >>>>> rwlock in write mode, so there's no ABBA? >>>> this is only possible with what you wrote below: >>>>> We will have however to drop the per domain read and vpci locks and >>>>> pick the per-domain lock in write mode. >>>> I think this is going to be unreliable. We need a reliable way to >>>> upgrade read lock to write lock. 
>>>> Then, we can drop pdev->vpci_lock at all, because we are always >>>> protected with d->rwlock and those who want to free pdev->vpci >>>> will use write lock. >>>> >>>> So, per-domain rwlock with write upgrade implemented minus pdev->vpci >>>> should do the trick >>> Linux doesn't implement write upgrade and it seems for a reason [1]: >>> "Also, you cannot “upgrade” a read-lock to a write-lock, so if you at _any_ time >>> need to do any changes (even if you don’t do it every time), you have to get >>> the write-lock at the very beginning." >>> >>> So, I am not sure we can have the same for Xen... >>> >>> At the moment I see at least two possible ways to solve the issue: >>> 1. Make vpci_write use write lock, thus make all write accesses synchronized >>> for the given domain, read are fully parallel >> >> 1b. Make vpci_write use write lock for writes to command register and BARs >> only; keep using the read lock for all other writes. > > We do not support writing to the BARs with memory decoding enabled > currently for dom0, so we would only need to pick the lock in write > mode for the command register and ROM BAR write handler AFAICT. Oh, right - this then makes for even less contention due to needing to acquire the lock in write mode. Jan
On 07.02.2022 17:07, Oleksandr Andrushchenko wrote: > On 07.02.22 17:26, Jan Beulich wrote: >> 1b. Make vpci_write use write lock for writes to command register and BARs >> only; keep using the read lock for all other writes. > I am not quite sure how to do that. Do you mean something like: > void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, > uint32_t data) > [snip] > list_for_each_entry ( r, &pdev->vpci->handlers, node ) > { > [snip] > if ( r->needs_write_lock) > write_lock(d->vpci_lock) > else > read_lock(d->vpci_lock) > .... > > And provide rw as an argument to: > > int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, > vpci_write_t *write_handler, unsigned int offset, > unsigned int size, void *data, --->>> bool write_path <<<-----) > > Is this what you mean? This sounds overly complicated. You can derive locally in vpci_write(), from just its "reg" and "size" parameters, whether the lock needs taking in write mode. Jan
On 07.02.22 18:15, Jan Beulich wrote: > On 07.02.2022 17:07, Oleksandr Andrushchenko wrote: >> On 07.02.22 17:26, Jan Beulich wrote: >>> 1b. Make vpci_write use write lock for writes to command register and BARs >>> only; keep using the read lock for all other writes. >> I am not quite sure how to do that. Do you mean something like: >> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, >> uint32_t data) >> [snip] >> list_for_each_entry ( r, &pdev->vpci->handlers, node ) >> { >> [snip] >> if ( r->needs_write_lock) >> write_lock(d->vpci_lock) >> else >> read_lock(d->vpci_lock) >> .... >> >> And provide rw as an argument to: >> >> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, >> vpci_write_t *write_handler, unsigned int offset, >> unsigned int size, void *data, --->>> bool write_path <<<-----) >> >> Is this what you mean? > This sounds overly complicated. You can derive locally in vpci_write(), > from just its "reg" and "size" parameters, whether the lock needs taking > in write mode. Yes, I started writing a reply with that. So, the summary (ROM position depends on header type): if ( (reg == PCI_COMMAND) || (reg == ROM) ) { read PCI_COMMAND and see if memory or IO decoding are enabled. if ( enabled ) write_lock(d->vpci_lock) else read_lock(d->vpci_lock) } Do you also think we can drop pdev->vpci (or currently pdev->vpci->lock) at all then? > Jan > > Thank you, Oleksandr
On 07.02.2022 17:21, Oleksandr Andrushchenko wrote: > > > On 07.02.22 18:15, Jan Beulich wrote: >> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote: >>> On 07.02.22 17:26, Jan Beulich wrote: >>>> 1b. Make vpci_write use write lock for writes to command register and BARs >>>> only; keep using the read lock for all other writes. >>> I am not quite sure how to do that. Do you mean something like: >>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, >>> uint32_t data) >>> [snip] >>> list_for_each_entry ( r, &pdev->vpci->handlers, node ) >>> { >>> [snip] >>> if ( r->needs_write_lock) >>> write_lock(d->vpci_lock) >>> else >>> read_lock(d->vpci_lock) >>> .... >>> >>> And provide rw as an argument to: >>> >>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, >>> vpci_write_t *write_handler, unsigned int offset, >>> unsigned int size, void *data, --->>> bool write_path <<<-----) >>> >>> Is this what you mean? >> This sounds overly complicated. You can derive locally in vpci_write(), >> from just its "reg" and "size" parameters, whether the lock needs taking >> in write mode. > Yes, I started writing a reply with that. So, the summary (ROM > position depends on header type): > if ( (reg == PCI_COMMAND) || (reg == ROM) ) > { > read PCI_COMMAND and see if memory or IO decoding are enabled. > if ( enabled ) > write_lock(d->vpci_lock) > else > read_lock(d->vpci_lock) > } Hmm, yes, you can actually get away without using "size", since both command register and ROM BAR are 32-bit aligned registers, and 64-bit accesses get split in vpci_ecam_write(). For the command register the memory- / IO-decoding-enabled check may end up a little more complicated, as the value to be written also matters. Maybe read the command register only for the ROM BAR write, using the write lock uniformly for all command register writes? > Do you also think we can drop pdev->vpci (or currently pdev->vpci->lock) > at all then? 
I haven't looked at this in any detail, sorry. It sounds possible, yes. Jan
On 07.02.22 18:37, Jan Beulich wrote: > On 07.02.2022 17:21, Oleksandr Andrushchenko wrote: >> >> On 07.02.22 18:15, Jan Beulich wrote: >>> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote: >>>> On 07.02.22 17:26, Jan Beulich wrote: >>>>> 1b. Make vpci_write use write lock for writes to command register and BARs >>>>> only; keep using the read lock for all other writes. >>>> I am not quite sure how to do that. Do you mean something like: >>>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, >>>> uint32_t data) >>>> [snip] >>>> list_for_each_entry ( r, &pdev->vpci->handlers, node ) >>>> { >>>> [snip] >>>> if ( r->needs_write_lock) >>>> write_lock(d->vpci_lock) >>>> else >>>> read_lock(d->vpci_lock) >>>> .... >>>> >>>> And provide rw as an argument to: >>>> >>>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, >>>> vpci_write_t *write_handler, unsigned int offset, >>>> unsigned int size, void *data, --->>> bool write_path <<<-----) >>>> >>>> Is this what you mean? >>> This sounds overly complicated. You can derive locally in vpci_write(), >>> from just its "reg" and "size" parameters, whether the lock needs taking >>> in write mode. >> Yes, I started writing a reply with that. So, the summary (ROM >> position depends on header type): >> if ( (reg == PCI_COMMAND) || (reg == ROM) ) >> { >> read PCI_COMMAND and see if memory or IO decoding are enabled. >> if ( enabled ) >> write_lock(d->vpci_lock) >> else >> read_lock(d->vpci_lock) >> } > Hmm, yes, you can actually get away without using "size", since both > command register and ROM BAR are 32-bit aligned registers, and 64-bit > accesses get split in vpci_ecam_write(). But, OS may want reading a single byte of ROM BAR, so I think I'll need to check if reg+size fall into PCI_COMAND and ROM BAR ranges > > For the command register the memory- / IO-decoding-enabled check may > end up a little more complicated, as the value to be written also > matters. 
Maybe read the command register only for the ROM BAR write, > using the write lock uniformly for all command register writes? Sounds good for the start. Another concern is that if we go with a read_lock and then in the underlying code we disable memory decoding and try doing something and calling cmd_write handler for any reason then.... I mean that the check in the vpci_write is somewhat we can tolerate, but then it is must be considered that no code in the read path is allowed to perform write path functions. Which brings a pretty valid use-case: say in read mode we detect an unrecoverable error and need to remove the device: vpci_process_pending -> ERROR -> vpci_remove_device or similar. What do we do then? It is all going to be fragile... > >> Do you also think we can drop pdev->vpci (or currently pdev->vpci->lock) >> at all then? > I haven't looked at this in any detail, sorry. It sounds possible, > yes. > > Jan > Thank you, Oleksandr
On 07.02.22 18:44, Oleksandr Andrushchenko wrote: > > On 07.02.22 18:37, Jan Beulich wrote: >> On 07.02.2022 17:21, Oleksandr Andrushchenko wrote: >>> On 07.02.22 18:15, Jan Beulich wrote: >>>> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote: >>>>> On 07.02.22 17:26, Jan Beulich wrote: >>>>>> 1b. Make vpci_write use write lock for writes to command register and BARs >>>>>> only; keep using the read lock for all other writes. >>>>> I am not quite sure how to do that. Do you mean something like: >>>>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, >>>>> uint32_t data) >>>>> [snip] >>>>> list_for_each_entry ( r, &pdev->vpci->handlers, node ) >>>>> { >>>>> [snip] >>>>> if ( r->needs_write_lock) >>>>> write_lock(d->vpci_lock) >>>>> else >>>>> read_lock(d->vpci_lock) >>>>> .... >>>>> >>>>> And provide rw as an argument to: >>>>> >>>>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, >>>>> vpci_write_t *write_handler, unsigned int offset, >>>>> unsigned int size, void *data, --->>> bool write_path <<<-----) >>>>> >>>>> Is this what you mean? >>>> This sounds overly complicated. You can derive locally in vpci_write(), >>>> from just its "reg" and "size" parameters, whether the lock needs taking >>>> in write mode. >>> Yes, I started writing a reply with that. So, the summary (ROM >>> position depends on header type): >>> if ( (reg == PCI_COMMAND) || (reg == ROM) ) >>> { >>> read PCI_COMMAND and see if memory or IO decoding are enabled. >>> if ( enabled ) >>> write_lock(d->vpci_lock) >>> else >>> read_lock(d->vpci_lock) >>> } >> Hmm, yes, you can actually get away without using "size", since both >> command register and ROM BAR are 32-bit aligned registers, and 64-bit >> accesses get split in vpci_ecam_write(). 
> But, OS may want reading a single byte of ROM BAR, so I think > I'll need to check if reg+size fall into PCI_COMMAND and ROM BAR > ranges >> For the command register the memory- / IO-decoding-enabled check may >> end up a little more complicated, as the value to be written also >> matters. Maybe read the command register only for the ROM BAR write, >> using the write lock uniformly for all command register writes? > Sounds good for the start. > Another concern is that if we go with a read_lock and then in the > underlying code we disable memory decoding and try doing > something and calling cmd_write handler for any reason then.... > > I mean that the check in the vpci_write is something we can tolerate, > but then it must be considered that no code in the read path > is allowed to perform write path functions. Which brings a pretty > valid use-case: say in read mode we detect an unrecoverable error > and need to remove the device: > vpci_process_pending -> ERROR -> vpci_remove_device or similar. > > What do we do then? It is all going to be fragile... I have tried to summarize the options we have wrt locking and would love to hear from @Roger and @Jan. In every variant there is a task of dealing with the overlap detection in modify_bars, so this is the only place as of now which needs special treatment. Existing limitations: there is no way to upgrade a read lock to a write lock, so paths which may require write lock protection need to use write lock from the very beginning. Workarounds can be applied. 1. Per-domain rw lock, aka d->vpci_lock ============================================================== Note: with per-domain rw lock it is possible to do without introducing per-device locks, so pdev->vpci->lock can be removed and no pdev->vpci_lock should be required. This is only going to work in case if vpci_write always takes the write lock and vpci_read takes a read lock and no path in vpci_read is allowed to perform write path operations. 
vpci_process_pending uses write lock as it has vpci_remove_device in its error path. Pros: - no per-device vpci lock is needed? - solves overlap code ABBA in modify_bars Cons: - all writes are serialized - need to carefully select read paths, so they are guaranteed not to lead to lock upgrade use-cases 1.1. Semi read lock upgrade in modify bars -------------------------------------------------------------- In this case both vpci_read and vpci_write take a read lock and when it comes to modify_bars: 1. read_unlock(d->vpci_lock) 2. write_lock(d->vpci_lock) 3. Check that pdev->vpci is still available and is the same object: if (pdev->vpci && (pdev->vpci == old_vpci) ) { /* vpci structure is valid and can be used. */ } else { /* vpci has gone, return an error. */ } Pros: - no per-device vpci lock is needed? - solves overlap code ABBA in modify_bars - readers and writers are NOT serialized - NO need to carefully select read paths, so they are guaranteed not to lead to lock upgrade use-cases Cons: - ??? 2. per-device lock (pdev->vpci_lock) + d->overlap_chk_lock ============================================================== In order to solve overlap ABBA, we introduce a per-domain helper lock to protect the overlapping code in modify_bars: old_vpci = pdev->vpci; spin_unlock(pdev->vpci_lock); spin_lock(pdev->domain->overlap_chk_lock); spin_lock(pdev->vpci_lock); if ( pdev->vpci && (pdev->vpci == old_vpci) ) for_each_pdev ( pdev->domain, tmp ) { if ( tmp != pdev ) { spin_lock(tmp->vpci_lock); if ( tmp->vpci ) ... } } Pros: - all accesses are independent, only the same device access is serialized - no need to care about readers and writers wrt read lock upgrade issues Cons: - helper spin lock 3. Move overlap detection into process pending ============================================================== There is Roger's patch [1] which adds a possibility for vpci_process_pending to perform different tasks rather than just map/unmap. 
With this patch extended in a way that it can hold a request queue it is possible to delay execution of the overlap code until no pdev->vpci_lock is held, but before returning to a guest after vpci_{read|write} or similar. Pros: - no need to emulate read lock upgrade - fully parallel read/write - queue in the vpci_process_pending will later on be used by SR-IOV, so this is going to help the future code Cons: - ??? 4. Re-write overlap detection code ============================================================== It is possible to re-write overlap detection code, so the information about the mapped/unmapped regions is not read from vpci->header->bars[i] of each device, but instead there is a per-domain structure which holds the regions and implements reference counting. Pros: - solves ABBA Cons: - very complex code is expected 5. You name it ============================================================== From all the above I would recommend we go with option 2 which seems to reliably solve ABBA and does not bring cons of the other approaches. Thank you in advance, Oleksandr [1] https://lore.kernel.org/all/5BABA6EF02000078001EC452@prv1-mh.provo.novell.com/T/#m231fb0586007725bfd8538bb97ff1777a36842cf
On 07.02.2022 17:44, Oleksandr Andrushchenko wrote: > > > On 07.02.22 18:37, Jan Beulich wrote: >> On 07.02.2022 17:21, Oleksandr Andrushchenko wrote: >>> >>> On 07.02.22 18:15, Jan Beulich wrote: >>>> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote: >>>>> On 07.02.22 17:26, Jan Beulich wrote: >>>>>> 1b. Make vpci_write use write lock for writes to command register and BARs >>>>>> only; keep using the read lock for all other writes. >>>>> I am not quite sure how to do that. Do you mean something like: >>>>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, >>>>> uint32_t data) >>>>> [snip] >>>>> list_for_each_entry ( r, &pdev->vpci->handlers, node ) >>>>> { >>>>> [snip] >>>>> if ( r->needs_write_lock) >>>>> write_lock(d->vpci_lock) >>>>> else >>>>> read_lock(d->vpci_lock) >>>>> .... >>>>> >>>>> And provide rw as an argument to: >>>>> >>>>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, >>>>> vpci_write_t *write_handler, unsigned int offset, >>>>> unsigned int size, void *data, --->>> bool write_path <<<-----) >>>>> >>>>> Is this what you mean? >>>> This sounds overly complicated. You can derive locally in vpci_write(), >>>> from just its "reg" and "size" parameters, whether the lock needs taking >>>> in write mode. >>> Yes, I started writing a reply with that. So, the summary (ROM >>> position depends on header type): >>> if ( (reg == PCI_COMMAND) || (reg == ROM) ) >>> { >>> read PCI_COMMAND and see if memory or IO decoding are enabled. >>> if ( enabled ) >>> write_lock(d->vpci_lock) >>> else >>> read_lock(d->vpci_lock) >>> } >> Hmm, yes, you can actually get away without using "size", since both >> command register and ROM BAR are 32-bit aligned registers, and 64-bit >> accesses get split in vpci_ecam_write(). 
> But, OS may want reading a single byte of ROM BAR, so I think > I'll need to check if reg+size fall into PCI_COMAND and ROM BAR > ranges >> >> For the command register the memory- / IO-decoding-enabled check may >> end up a little more complicated, as the value to be written also >> matters. Maybe read the command register only for the ROM BAR write, >> using the write lock uniformly for all command register writes? > Sounds good for the start. > Another concern is that if we go with a read_lock and then in the > underlying code we disable memory decoding and try doing > something and calling cmd_write handler for any reason then.... > > I mean that the check in the vpci_write is somewhat we can tolerate, > but then it is must be considered that no code in the read path > is allowed to perform write path functions. Which brings a pretty > valid use-case: say in read mode we detect an unrecoverable error > and need to remove the device: > vpci_process_pending -> ERROR -> vpci_remove_device or similar. > > What do we do then? It is all going to be fragile... Real hardware won't cause a device to disappear upon a problem with a read access. There shouldn't be any need to remove a passed-through device either; such problems (if any) need handling differently imo. Jan
On 08.02.2022 08:35, Oleksandr Andrushchenko wrote: > 1.1. Semi read lock upgrade in modify bars > -------------------------------------------------------------- > In this case both vpci_read and vpci_write take a read lock and when it comes > to modify_bars: > > 1. read_unlock(d->vpci_lock) > 2. write_lock(d->vpci_lock) > 3. Check that pdev->vpci is still available and is the same object: > if (pdev->vpci && (pdev->vpci == old_vpci) ) > { > /* vpci structure is valid and can be used. */ > } > else > { > /* vpci has gone, return an error. */ > } > > Pros: > - no per-device vpci lock is needed? > - solves overlap code ABBA in modify_bars > - readers and writers are NOT serialized > - NO need to carefully select read paths, so they are guaranteed not to lead > to lock upgrade use-cases > > Cons: > - ??? The "pdev->vpci == old_vpci" is fragile: The struct may have got re- allocated, and it just so happened that the two pointers are identical. Same then for the subsequent variant 2. Jan
On 08.02.22 10:53, Jan Beulich wrote: > On 07.02.2022 17:44, Oleksandr Andrushchenko wrote: >> >> On 07.02.22 18:37, Jan Beulich wrote: >>> On 07.02.2022 17:21, Oleksandr Andrushchenko wrote: >>>> On 07.02.22 18:15, Jan Beulich wrote: >>>>> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote: >>>>>> On 07.02.22 17:26, Jan Beulich wrote: >>>>>>> 1b. Make vpci_write use write lock for writes to command register and BARs >>>>>>> only; keep using the read lock for all other writes. >>>>>> I am not quite sure how to do that. Do you mean something like: >>>>>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, >>>>>> uint32_t data) >>>>>> [snip] >>>>>> list_for_each_entry ( r, &pdev->vpci->handlers, node ) >>>>>> { >>>>>> [snip] >>>>>> if ( r->needs_write_lock) >>>>>> write_lock(d->vpci_lock) >>>>>> else >>>>>> read_lock(d->vpci_lock) >>>>>> .... >>>>>> >>>>>> And provide rw as an argument to: >>>>>> >>>>>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, >>>>>> vpci_write_t *write_handler, unsigned int offset, >>>>>> unsigned int size, void *data, --->>> bool write_path <<<-----) >>>>>> >>>>>> Is this what you mean? >>>>> This sounds overly complicated. You can derive locally in vpci_write(), >>>>> from just its "reg" and "size" parameters, whether the lock needs taking >>>>> in write mode. >>>> Yes, I started writing a reply with that. So, the summary (ROM >>>> position depends on header type): >>>> if ( (reg == PCI_COMMAND) || (reg == ROM) ) >>>> { >>>> read PCI_COMMAND and see if memory or IO decoding are enabled. >>>> if ( enabled ) >>>> write_lock(d->vpci_lock) >>>> else >>>> read_lock(d->vpci_lock) >>>> } >>> Hmm, yes, you can actually get away without using "size", since both >>> command register and ROM BAR are 32-bit aligned registers, and 64-bit >>> accesses get split in vpci_ecam_write(). 
>> But, OS may want reading a single byte of ROM BAR, so I think >> I'll need to check if reg+size fall into PCI_COMAND and ROM BAR >> ranges >>> For the command register the memory- / IO-decoding-enabled check may >>> end up a little more complicated, as the value to be written also >>> matters. Maybe read the command register only for the ROM BAR write, >>> using the write lock uniformly for all command register writes? >> Sounds good for the start. >> Another concern is that if we go with a read_lock and then in the >> underlying code we disable memory decoding and try doing >> something and calling cmd_write handler for any reason then.... >> >> I mean that the check in the vpci_write is somewhat we can tolerate, >> but then it is must be considered that no code in the read path >> is allowed to perform write path functions. Which brings a pretty >> valid use-case: say in read mode we detect an unrecoverable error >> and need to remove the device: >> vpci_process_pending -> ERROR -> vpci_remove_device or similar. >> >> What do we do then? It is all going to be fragile... > Real hardware won't cause a device to disappear upon a problem with > a read access. There shouldn't be any need to remove a passed-through > device either; such problems (if any) need handling differently imo. Yes, at the moment there is a single place in the code which removes the device (besides normal use-cases such as pci_add_device on fail path and PHYSDEVOP_manage_pci_remove): bool vpci_process_pending(struct vcpu *v) { [snip] if ( rc ) /* * FIXME: in case of failure remove the device from the domain. * Note that there might still be leftover mappings. While this is * safe for Dom0, for DomUs the domain will likely need to be * killed in order to avoid leaking stale p2m mappings on * failure. */ vpci_remove_device(v->vpci.pdev); > > Jan > >
On 08.02.22 10:57, Jan Beulich wrote: > On 08.02.2022 08:35, Oleksandr Andrushchenko wrote: >> 1.1. Semi read lock upgrade in modify bars >> -------------------------------------------------------------- >> In this case both vpci_read and vpci_write take a read lock and when it comes >> to modify_bars: >> >> 1. read_unlock(d->vpci_lock) >> 2. write_lock(d->vpci_lock) >> 3. Check that pdev->vpci is still available and is the same object: >> if (pdev->vpci && (pdev->vpci == old_vpci) ) >> { >> /* vpci structure is valid and can be used. */ >> } >> else >> { >> /* vpci has gone, return an error. */ >> } >> >> Pros: >> - no per-device vpci lock is needed? >> - solves overlap code ABBA in modify_bars >> - readers and writers are NOT serialized >> - NO need to carefully select read paths, so they are guaranteed not to lead >> to lock upgrade use-cases >> >> Cons: >> - ??? > The "pdev->vpci == old_vpci" is fragile: The struct may have got re- > allocated, and it just so happened that the two pointers are identical. > > Same then for the subsequent variant 2. Yes, it is possible. We can add an ID number to pdev->vpci, so each newly allocated vpci structure has a unique ID which can be used to compare vpci structures. It can be something like pdev->vpci->id = d->vpci_id++; with id being uint32_t, for example. > > Jan > >
On Mon, Feb 07, 2022 at 05:37:49PM +0100, Jan Beulich wrote: > On 07.02.2022 17:21, Oleksandr Andrushchenko wrote: > > > > > > On 07.02.22 18:15, Jan Beulich wrote: > >> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote: > >>> On 07.02.22 17:26, Jan Beulich wrote: > >>>> 1b. Make vpci_write use write lock for writes to command register and BARs > >>>> only; keep using the read lock for all other writes. > >>> I am not quite sure how to do that. Do you mean something like: > >>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, > >>> uint32_t data) > >>> [snip] > >>> list_for_each_entry ( r, &pdev->vpci->handlers, node ) > >>> { > >>> [snip] > >>> if ( r->needs_write_lock) > >>> write_lock(d->vpci_lock) > >>> else > >>> read_lock(d->vpci_lock) > >>> .... > >>> > >>> And provide rw as an argument to: > >>> > >>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, > >>> vpci_write_t *write_handler, unsigned int offset, > >>> unsigned int size, void *data, --->>> bool write_path <<<-----) > >>> > >>> Is this what you mean? > >> This sounds overly complicated. You can derive locally in vpci_write(), > >> from just its "reg" and "size" parameters, whether the lock needs taking > >> in write mode. > > Yes, I started writing a reply with that. So, the summary (ROM > > position depends on header type): > > if ( (reg == PCI_COMMAND) || (reg == ROM) ) > > { > > read PCI_COMMAND and see if memory or IO decoding are enabled. > > if ( enabled ) > > write_lock(d->vpci_lock) > > else > > read_lock(d->vpci_lock) > > } > > Hmm, yes, you can actually get away without using "size", since both > command register and ROM BAR are 32-bit aligned registers, and 64-bit > accesses get split in vpci_ecam_write(). > > For the command register the memory- / IO-decoding-enabled check may > end up a little more complicated, as the value to be written also > matters. 
Maybe read the command register only for the ROM BAR write, > using the write lock uniformly for all command register writes? > > > Do you also think we can drop pdev->vpci (or currently pdev->vpci->lock) > > at all then? > > I haven't looked at this in any detail, sorry. It sounds possible, > yes. AFAICT you should avoid taking the per-device vpci lock when you take the per-domain lock in write mode. Otherwise you still need the per-device vpci lock in order to keep consistency between concurrent accesses to the device registers. Thanks, Roger.
On 08.02.22 12:11, Roger Pau Monné wrote: > On Mon, Feb 07, 2022 at 05:37:49PM +0100, Jan Beulich wrote: >> On 07.02.2022 17:21, Oleksandr Andrushchenko wrote: >>> >>> On 07.02.22 18:15, Jan Beulich wrote: >>>> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote: >>>>> On 07.02.22 17:26, Jan Beulich wrote: >>>>>> 1b. Make vpci_write use write lock for writes to command register and BARs >>>>>> only; keep using the read lock for all other writes. >>>>> I am not quite sure how to do that. Do you mean something like: >>>>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, >>>>> uint32_t data) >>>>> [snip] >>>>> list_for_each_entry ( r, &pdev->vpci->handlers, node ) >>>>> { >>>>> [snip] >>>>> if ( r->needs_write_lock) >>>>> write_lock(d->vpci_lock) >>>>> else >>>>> read_lock(d->vpci_lock) >>>>> .... >>>>> >>>>> And provide rw as an argument to: >>>>> >>>>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, >>>>> vpci_write_t *write_handler, unsigned int offset, >>>>> unsigned int size, void *data, --->>> bool write_path <<<-----) >>>>> >>>>> Is this what you mean? >>>> This sounds overly complicated. You can derive locally in vpci_write(), >>>> from just its "reg" and "size" parameters, whether the lock needs taking >>>> in write mode. >>> Yes, I started writing a reply with that. So, the summary (ROM >>> position depends on header type): >>> if ( (reg == PCI_COMMAND) || (reg == ROM) ) >>> { >>> read PCI_COMMAND and see if memory or IO decoding are enabled. >>> if ( enabled ) >>> write_lock(d->vpci_lock) >>> else >>> read_lock(d->vpci_lock) >>> } >> Hmm, yes, you can actually get away without using "size", since both >> command register and ROM BAR are 32-bit aligned registers, and 64-bit >> accesses get split in vpci_ecam_write(). >> >> For the command register the memory- / IO-decoding-enabled check may >> end up a little more complicated, as the value to be written also >> matters. 
Maybe read the command register only for the ROM BAR write, >> using the write lock uniformly for all command register writes? >> >>> Do you also think we can drop pdev->vpci (or currently pdev->vpci->lock) >>> at all then? >> I haven't looked at this in any detail, sorry. It sounds possible, >> yes. > AFAICT you should avoid taking the per-device vpci lock when you take > the per-domain lock in write mode. Otherwise you still need the > per-device vpci lock in order to keep consistency between concurrent > accesses to the device registers. I have sent an e-mail this morning describing possible locking schemes. Could we please move there and continue if you don't mind? > > Thanks, Roger. Thank you in advance, Oleksandr
On Tue, Feb 08, 2022 at 07:35:34AM +0000, Oleksandr Andrushchenko wrote: > > > On 07.02.22 18:44, Oleksandr Andrushchenko wrote: > > > > On 07.02.22 18:37, Jan Beulich wrote: > >> On 07.02.2022 17:21, Oleksandr Andrushchenko wrote: > >>> On 07.02.22 18:15, Jan Beulich wrote: > >>>> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote: > >>>>> On 07.02.22 17:26, Jan Beulich wrote: > >>>>>> 1b. Make vpci_write use write lock for writes to command register and BARs > >>>>>> only; keep using the read lock for all other writes. > >>>>> I am not quite sure how to do that. Do you mean something like: > >>>>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, > >>>>> uint32_t data) > >>>>> [snip] > >>>>> list_for_each_entry ( r, &pdev->vpci->handlers, node ) > >>>>> { > >>>>> [snip] > >>>>> if ( r->needs_write_lock) > >>>>> write_lock(d->vpci_lock) > >>>>> else > >>>>> read_lock(d->vpci_lock) > >>>>> .... > >>>>> > >>>>> And provide rw as an argument to: > >>>>> > >>>>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, > >>>>> vpci_write_t *write_handler, unsigned int offset, > >>>>> unsigned int size, void *data, --->>> bool write_path <<<-----) > >>>>> > >>>>> Is this what you mean? > >>>> This sounds overly complicated. You can derive locally in vpci_write(), > >>>> from just its "reg" and "size" parameters, whether the lock needs taking > >>>> in write mode. > >>> Yes, I started writing a reply with that. So, the summary (ROM > >>> position depends on header type): > >>> if ( (reg == PCI_COMMAND) || (reg == ROM) ) > >>> { > >>> read PCI_COMMAND and see if memory or IO decoding are enabled. > >>> if ( enabled ) > >>> write_lock(d->vpci_lock) > >>> else > >>> read_lock(d->vpci_lock) > >>> } > >> Hmm, yes, you can actually get away without using "size", since both > >> command register and ROM BAR are 32-bit aligned registers, and 64-bit > >> accesses get split in vpci_ecam_write(). 
> > But, OS may want reading a single byte of ROM BAR, so I think > > I'll need to check if reg+size fall into PCI_COMAND and ROM BAR > > ranges > >> For the command register the memory- / IO-decoding-enabled check may > >> end up a little more complicated, as the value to be written also > >> matters. Maybe read the command register only for the ROM BAR write, > >> using the write lock uniformly for all command register writes? > > Sounds good for the start. > > Another concern is that if we go with a read_lock and then in the > > underlying code we disable memory decoding and try doing > > something and calling cmd_write handler for any reason then.... > > > > I mean that the check in the vpci_write is somewhat we can tolerate, > > but then it is must be considered that no code in the read path > > is allowed to perform write path functions. Which brings a pretty > > valid use-case: say in read mode we detect an unrecoverable error > > and need to remove the device: > > vpci_process_pending -> ERROR -> vpci_remove_device or similar. > > > > What do we do then? It is all going to be fragile... > I have tried to summarize the options we have wrt locking > and would love to hear from @Roger and @Jan. > > In every variant there is a task of dealing with the overlap > detection in modify_bars, so this is the only place as of now > which needs special treatment. > > Existing limitations: there is no way to upgrade a read lock to a write > lock, so paths which may require write lock protection need to use > write lock from the very beginning. Workarounds can be applied. > > 1. Per-domain rw lock, aka d->vpci_lock > ============================================================== > Note: with per-domain rw lock it is possible to do without introducing > per-device locks, so pdev->vpci->lock can be removed and no pdev->vpci_lock > should be required. 
Er, no, I think you still need a per-device lock unless you intent to take the per-domain rwlock in write mode every time you modify data in vpci. I still think you need pdev->vpci->lock. It's possible this approach doesn't require moving the lock outside of the vpci struct. > This is only going to work in case if vpci_write always takes the write lock > and vpci_read takes a read lock and no path in vpci_read is allowed to > perform write path operations. I think that's likely too strong? You could get away with both vpci_{read,write} only taking the read lock and use a per-device vpci lock? Otherwise you are likely to introduce contention in msix_write if a guest makes heavy use of the MSI-X entry mask bit. > vpci_process_pending uses write lock as it have vpci_remove_device in its > error path. > > Pros: > - no per-device vpci lock is needed? > - solves overlap code ABBA in modify_bars > > Cons: > - all writes are serialized > - need to carefully select read paths, so they are guaranteed not to lead > to lock upgrade use-cases > > 1.1. Semi read lock upgrade in modify bars > -------------------------------------------------------------- > In this case both vpci_read and vpci_write take a read lock and when it comes > to modify_bars: > > 1. read_unlock(d->vpci_lock) > 2. write_lock(d->vpci_lock) > 3. Check that pdev->vpci is still available and is the same object: > if (pdev->vpci && (pdev->vpci == old_vpci) ) > { > /* vpci structure is valid and can be used. */ > } > else > { > /* vpci has gone, return an error. */ > } > > Pros: > - no per-device vpci lock is needed? > - solves overlap code ABBA in modify_bars > - readers and writers are NOT serialized > - NO need to carefully select read paths, so they are guaranteed not to lead > to lock upgrade use-cases > > Cons: > - ??? > > 2. 
per-device lock (pdev->vpci_lock) + d->overlap_chk_lock > ============================================================== > In order to solve overlap ABBA, we introduce a per-domain helper > lock to protect the overlapping code in modify_bars: > > old_vpci = pdev->vpci; > spin_unlock(pdev->vpci_lock); > spin_lock(pdev->domain->overlap_chk_lock); Since you drop the pdev lock you get a window here where either vpci or even pdev itself could be removed under your feet, so using pdev->vpci_lock like you do below could dereference a stale pdev. > spin_lock(pdev->vpci_lock); > if ( pdev->vpci && (pdev->vpci == old_vpci) ) > for_each_pdev ( pdev->domain, tmp ) > { > if ( tmp != pdev ) > { > spin_lock(tmp->vpci_lock); > if ( tmp->vpci ) > ... > } > } > > Pros: > - all accesses are independent, only the same device access is serialized > - no need to care about readers and writers wrt read lock upgrade issues > > Cons: > - helper spin lock > > 3. Move overlap detection into process pending > ============================================================== > There is a Roger's patch [1] which adds a possibility for vpci_process_pending > to perform different tasks rather than just map/unmap. With this patch extended > in a way that it can hold a request queue it is possible to delay execution > of the overlap code until no pdev->vpci_lock is held, but before returning to > a guest after vpci_{read|write} or similar. > > Pros: > - no need to emulate read lock upgrade > - fully parallel read/write > - queue in the vpci_process_pending will later on be used by SR-IOV, > so this is going to help the future code > Cons: > - ??? Maybe? It's hard to devise how that would end up looking like, and whether it won't still require such kind of double locking. We would still need to prevent doing a rangeset_remove_range for the device we are trying to setup the mapping for, at which point we still need to lock the current device plus the device we are iterating against? 
Since the code in vpci_process_pending is always executed in guest vCPU context requiring all guest vCPUs to be paused when doing a device addition or removal would prevent devices from going away, but we could still have issues with concurrent accesses from other vCPUs. > > 4. Re-write overlap detection code > ============================================================== > It is possible to re-write overlap detection code, so the information about the > mapped/unmapped regions is not read from vpci->header->bars[i] of each device, > but instead there is a per-domain structure which holds the regions and > implements reference counting. > > Pros: > - solves ABBA > > Cons: > - very complex code is expected > > 5. You name it > ============================================================== > > From all the above I would recommend we go with option 2 which seems to reliably > solve ABBA and does not bring cons of the other approaches. 6. per-domain rwlock + per-device vpci lock Introduce vpci_header_write_lock(start, {end, size}) helper: return whether a range requires the per-domain lock in write mode. This will only return true if the range overlaps with the BAR ROM or the command register. In vpci_{read,write}: if ( vpci_header_write_lock(...) ) /* Gain exclusive access to all of the domain pdevs vpci. */ write_lock(d->vpci); else { read_lock(d->vpci); spin_lock(vpci->lock); } ... The vpci assign/deassign functions would need to be modified to write lock the per-domain rwlock. The MSI-X table MMIO handler will also need to read lock the per domain vpci lock. I think it's either something along the lines of my suggestion above, or maybe option 3, albeit you would have to investigate how to implement option 3. Thanks, Roger.
On 08.02.22 12:50, Roger Pau Monné wrote: > On Tue, Feb 08, 2022 at 07:35:34AM +0000, Oleksandr Andrushchenko wrote: >> >> On 07.02.22 18:44, Oleksandr Andrushchenko wrote: >>> On 07.02.22 18:37, Jan Beulich wrote: >>>> On 07.02.2022 17:21, Oleksandr Andrushchenko wrote: >>>>> On 07.02.22 18:15, Jan Beulich wrote: >>>>>> On 07.02.2022 17:07, Oleksandr Andrushchenko wrote: >>>>>>> On 07.02.22 17:26, Jan Beulich wrote: >>>>>>>> 1b. Make vpci_write use write lock for writes to command register and BARs >>>>>>>> only; keep using the read lock for all other writes. >>>>>>> I am not quite sure how to do that. Do you mean something like: >>>>>>> void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, >>>>>>> uint32_t data) >>>>>>> [snip] >>>>>>> list_for_each_entry ( r, &pdev->vpci->handlers, node ) >>>>>>> { >>>>>>> [snip] >>>>>>> if ( r->needs_write_lock) >>>>>>> write_lock(d->vpci_lock) >>>>>>> else >>>>>>> read_lock(d->vpci_lock) >>>>>>> .... >>>>>>> >>>>>>> And provide rw as an argument to: >>>>>>> >>>>>>> int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, >>>>>>> vpci_write_t *write_handler, unsigned int offset, >>>>>>> unsigned int size, void *data, --->>> bool write_path <<<-----) >>>>>>> >>>>>>> Is this what you mean? >>>>>> This sounds overly complicated. You can derive locally in vpci_write(), >>>>>> from just its "reg" and "size" parameters, whether the lock needs taking >>>>>> in write mode. >>>>> Yes, I started writing a reply with that. So, the summary (ROM >>>>> position depends on header type): >>>>> if ( (reg == PCI_COMMAND) || (reg == ROM) ) >>>>> { >>>>> read PCI_COMMAND and see if memory or IO decoding are enabled. >>>>> if ( enabled ) >>>>> write_lock(d->vpci_lock) >>>>> else >>>>> read_lock(d->vpci_lock) >>>>> } >>>> Hmm, yes, you can actually get away without using "size", since both >>>> command register and ROM BAR are 32-bit aligned registers, and 64-bit >>>> accesses get split in vpci_ecam_write(). 
>>> But, OS may want reading a single byte of ROM BAR, so I think >>> I'll need to check if reg+size fall into PCI_COMAND and ROM BAR >>> ranges >>>> For the command register the memory- / IO-decoding-enabled check may >>>> end up a little more complicated, as the value to be written also >>>> matters. Maybe read the command register only for the ROM BAR write, >>>> using the write lock uniformly for all command register writes? >>> Sounds good for the start. >>> Another concern is that if we go with a read_lock and then in the >>> underlying code we disable memory decoding and try doing >>> something and calling cmd_write handler for any reason then.... >>> >>> I mean that the check in the vpci_write is somewhat we can tolerate, >>> but then it is must be considered that no code in the read path >>> is allowed to perform write path functions. Which brings a pretty >>> valid use-case: say in read mode we detect an unrecoverable error >>> and need to remove the device: >>> vpci_process_pending -> ERROR -> vpci_remove_device or similar. >>> >>> What do we do then? It is all going to be fragile... >> I have tried to summarize the options we have wrt locking >> and would love to hear from @Roger and @Jan. >> >> In every variant there is a task of dealing with the overlap >> detection in modify_bars, so this is the only place as of now >> which needs special treatment. >> >> Existing limitations: there is no way to upgrade a read lock to a write >> lock, so paths which may require write lock protection need to use >> write lock from the very beginning. Workarounds can be applied. >> >> 1. Per-domain rw lock, aka d->vpci_lock >> ============================================================== >> Note: with per-domain rw lock it is possible to do without introducing >> per-device locks, so pdev->vpci->lock can be removed and no pdev->vpci_lock >> should be required. 
> Er, no, I think you still need a per-device lock unless you intent to > take the per-domain rwlock in write mode every time you modify data > in vpci. This is exactly the assumption stated below. I am trying to discuss all the possible options, so this one is also listed > I still think you need pdev->vpci->lock. It's possible this > approach doesn't require moving the lock outside of the vpci struct. > >> This is only going to work in case if vpci_write always takes the write lock >> and vpci_read takes a read lock and no path in vpci_read is allowed to >> perform write path operations. > I think that's likely too strong? > > You could get away with both vpci_{read,write} only taking the read > lock and use a per-device vpci lock? But as discussed before: - if pdev->vpci_lock is used this still leads to ABBA - we should know about if to take the write lock beforehand > > Otherwise you are likely to introduce contention in msix_write if a > guest makes heavy use of the MSI-X entry mask bit. > >> vpci_process_pending uses write lock as it have vpci_remove_device in its >> error path. >> >> Pros: >> - no per-device vpci lock is needed? >> - solves overlap code ABBA in modify_bars >> >> Cons: >> - all writes are serialized >> - need to carefully select read paths, so they are guaranteed not to lead >> to lock upgrade use-cases >> >> 1.1. Semi read lock upgrade in modify bars >> -------------------------------------------------------------- >> In this case both vpci_read and vpci_write take a read lock and when it comes >> to modify_bars: >> >> 1. read_unlock(d->vpci_lock) >> 2. write_lock(d->vpci_lock) >> 3. Check that pdev->vpci is still available and is the same object: >> if (pdev->vpci && (pdev->vpci == old_vpci) ) >> { >> /* vpci structure is valid and can be used. */ >> } >> else >> { >> /* vpci has gone, return an error. */ >> } >> >> Pros: >> - no per-device vpci lock is needed? 
>> - solves overlap code ABBA in modify_bars >> - readers and writers are NOT serialized >> - NO need to carefully select read paths, so they are guaranteed not to lead >> to lock upgrade use-cases >> >> Cons: >> - ??? >> >> 2. per-device lock (pdev->vpci_lock) + d->overlap_chk_lock >> ============================================================== >> In order to solve overlap ABBA, we introduce a per-domain helper >> lock to protect the overlapping code in modify_bars: >> >> old_vpci = pdev->vpci; >> spin_unlock(pdev->vpci_lock); >> spin_lock(pdev->domain->overlap_chk_lock); > Since you drop the pdev lock you get a window here where either vpci > or even pdev itself could be removed under your feet, so using > pdev->vpci_lock like you do below could dereference a stale pdev. pdev is anyway not protected with pcidevs lock here, so even now it is possible to have pdev disappear in between. We do not use pcidevs_lock in MMIO handlers... > >> spin_lock(pdev->vpci_lock); >> if ( pdev->vpci && (pdev->vpci == old_vpci) ) >> for_each_pdev ( pdev->domain, tmp ) >> { >> if ( tmp != pdev ) >> { >> spin_lock(tmp->vpci_lock); >> if ( tmp->vpci ) >> ... >> } >> } >> >> Pros: >> - all accesses are independent, only the same device access is serialized >> - no need to care about readers and writers wrt read lock upgrade issues >> >> Cons: >> - helper spin lock >> >> 3. Move overlap detection into process pending >> ============================================================== >> There is a Roger's patch [1] which adds a possibility for vpci_process_pending >> to perform different tasks rather than just map/unmap. With this patch extended >> in a way that it can hold a request queue it is possible to delay execution >> of the overlap code until no pdev->vpci_lock is held, but before returning to >> a guest after vpci_{read|write} or similar.
>> >> Pros: >> - no need to emulate read lock upgrade >> - fully parallel read/write >> - queue in the vpci_process_pending will later on be used by SR-IOV, >> so this is going to help the future code >> Cons: >> - ??? > Maybe? It's hard to devise how that would end up looking like, and > whether it won't still require such kind of double locking. We would > still need to prevent doing a rangeset_remove_range for the device we > are trying to setup the mapping for, at which point we still need to > lock the current device plus the device we are iterating against? > > Since the code in vpci_process_pending is always executed in guest > vCPU context requiring all guest vCPUs to be paused when doing a > device addition or removal would prevent devices from going away, but > we could still have issues with concurrent accesses from other vCPUs. Yes, I understand that this may not be easily done, but this is still an option, > >> 4. Re-write overlap detection code >> ============================================================== >> It is possible to re-write overlap detection code, so the information about the >> mapped/unmapped regions is not read from vpci->header->bars[i] of each device, >> but instead there is a per-domain structure which holds the regions and >> implements reference counting. >> >> Pros: >> - solves ABBA >> >> Cons: >> - very complex code is expected >> >> 5. You name it >> ============================================================== >> >> From all the above I would recommend we go with option 2 which seems to reliably >> solve ABBA and does not bring cons of the other approaches. > 6. per-domain rwlock + per-device vpci lock > > Introduce vpci_header_write_lock(start, {end, size}) helper: return > whether a range requires the per-domain lock in write mode. This will > only return true if the range overlaps with the BAR ROM or the command > register. > > In vpci_{read,write}: > > if ( vpci_header_write_lock(...) 
) > /* Gain exclusive access to all of the domain pdevs vpci. */ > write_lock(d->vpci); > else > { > read_lock(d->vpci); > spin_lock(vpci->lock); > } > ... > > The vpci assign/deassign functions would need to be modified to write > lock the per-domain rwlock. The MSI-X table MMIO handler will also > need to read lock the per domain vpci lock. Ok, so it seems you are in favor of this implementation and I have no objection as well. The only limitation we should be aware of is that once a path has acquired the read lock it is not possible to do any write path operations in there. vpci_process_pending will acquire write lock though as it can lead to vpci_remove_device on its error path. So, I am going to implement pdev->vpci->lock + d->vpci_lock > > I think it's either something along the lines of my suggestion above, > or maybe option 3, albeit you would have to investigate how to > implement option 3. > > Thanks, Roger. @Roger, @Jan! Thank you!!
On Tue, Feb 08, 2022 at 11:13:41AM +0000, Oleksandr Andrushchenko wrote: > > > On 08.02.22 12:50, Roger Pau Monné wrote: > > On Tue, Feb 08, 2022 at 07:35:34AM +0000, Oleksandr Andrushchenko wrote: > >> 5. You name it > >> ============================================================== > >> > >> From all the above I would recommend we go with option 2 which seems to reliably > >> solve ABBA and does not bring cons of the other approaches. > > 6. per-domain rwlock + per-device vpci lock > > > > Introduce vpci_header_write_lock(start, {end, size}) helper: return > > whether a range requires the per-domain lock in write mode. This will > > only return true if the range overlaps with the BAR ROM or the command > > register. > > > > In vpci_{read,write}: > > > > if ( vpci_header_write_lock(...) ) > > /* Gain exclusive access to all of the domain pdevs vpci. */ > > write_lock(d->vpci); > > else > > { > > read_lock(d->vpci); > > spin_lock(vpci->lock); > > } > > ... > > > > The vpci assign/deassign functions would need to be modified to write > > lock the per-domain rwlock. The MSI-X table MMIO handler will also > > need to read lock the per domain vpci lock. > Ok, so it seems you are in favor of this implementation and I have > no objection as well. The only limitation we should be aware of is > that once a path has acquired the read lock it is not possible to do > any write path operations in there. > vpci_process_pending will acquire write lock though as it can > lead to vpci_remove_device on its error path. > > So, I am going to implement pdev->vpci->lock + d->vpci_lock I think it's the less uncertain option. As said, if you want to investigate whether you can successfully move the checking into vpci_process_pending that would also be fine with me, but I cannot assert it's going to be successful. OTOH I think the per-domain rwlock + per-device spinlock seems quite likely to solve our issues. Thanks, Roger.
On 08.02.22 15:38, Roger Pau Monné wrote: > On Tue, Feb 08, 2022 at 11:13:41AM +0000, Oleksandr Andrushchenko wrote: >> >> On 08.02.22 12:50, Roger Pau Monné wrote: >>> On Tue, Feb 08, 2022 at 07:35:34AM +0000, Oleksandr Andrushchenko wrote: >>>> 5. You name it >>>> ============================================================== >>>> >>>> From all the above I would recommend we go with option 2 which seems to reliably >>>> solve ABBA and does not bring cons of the other approaches. >>> 6. per-domain rwlock + per-device vpci lock >>> >>> Introduce vpci_header_write_lock(start, {end, size}) helper: return >>> whether a range requires the per-domain lock in write mode. This will >>> only return true if the range overlaps with the BAR ROM or the command >>> register. >>> >>> In vpci_{read,write}: >>> >>> if ( vpci_header_write_lock(...) ) >>> /* Gain exclusive access to all of the domain pdevs vpci. */ >>> write_lock(d->vpci); >>> else >>> { >>> read_lock(d->vpci); >>> spin_lock(vpci->lock); >>> } >>> ... >>> >>> The vpci assign/deassign functions would need to be modified to write >>> lock the per-domain rwlock. The MSI-X table MMIO handler will also >>> need to read lock the per domain vpci lock. >> Ok, so it seems you are in favor of this implementation and I have >> no objection as well. The only limitation we should be aware of is >> that once a path has acquired the read lock it is not possible to do >> any write path operations in there. >> vpci_process_pending will acquire write lock though as it can >> lead to vpci_remove_device on its error path. >> >> So, I am going to implement pdev->vpci->lock + d->vpci_lock > I think it's the less uncertain option. > > As said, if you want to investigate whether you can successfully move > the checking into vpci_process_pending that would also be fine with > me, but I cannot assert it's going to be successful. OTOH I think the > per-domain rwlock + per-device spinlock seems quite likely to solve > our issues. 
Ok, then I'll go with per-domain rwlock + per-device spinlock and write lock in vpci_write for cmd + ROM. Of course, other places such as vpci_remove_device and vpci_process_pending will use the write lock. > > Thanks, Roger. > Thank you, Oleksandr
diff --git a/tools/tests/vpci/emul.h b/tools/tests/vpci/emul.h index 2e1d3057c9d8..d018fb5eef21 100644 --- a/tools/tests/vpci/emul.h +++ b/tools/tests/vpci/emul.h @@ -44,6 +44,7 @@ struct domain { }; struct pci_dev { + bool vpci_lock; struct vpci *vpci; }; @@ -53,10 +54,8 @@ struct vcpu }; extern const struct vcpu *current; -extern const struct pci_dev test_pdev; +extern struct pci_dev test_pdev; -typedef bool spinlock_t; -#define spin_lock_init(l) (*(l) = false) #define spin_lock(l) (*(l) = true) #define spin_unlock(l) (*(l) = false) diff --git a/tools/tests/vpci/main.c b/tools/tests/vpci/main.c index b9a0a6006bb9..3b86ed232eb1 100644 --- a/tools/tests/vpci/main.c +++ b/tools/tests/vpci/main.c @@ -23,7 +23,7 @@ static struct vpci vpci; const static struct domain d; -const struct pci_dev test_pdev = { +struct pci_dev test_pdev = { .vpci = &vpci, }; @@ -158,7 +158,6 @@ main(int argc, char **argv) int rc; INIT_LIST_HEAD(&vpci.handlers); - spin_lock_init(&vpci.lock); VPCI_ADD_REG(vpci_read32, vpci_write32, 0, 4, r0); VPCI_READ_CHECK(0, 4, r0); diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c index 13e2a190b439..1f7a37f78264 100644 --- a/xen/arch/x86/hvm/vmsi.c +++ b/xen/arch/x86/hvm/vmsi.c @@ -910,14 +910,14 @@ int vpci_msix_arch_print(const struct vpci_msix *msix) { struct pci_dev *pdev = msix->pdev; - spin_unlock(&msix->pdev->vpci->lock); + spin_unlock(&msix->pdev->vpci_lock); process_pending_softirqs(); /* NB: we assume that pdev cannot go away for an alive domain. 
*/ - if ( !pdev->vpci || !spin_trylock(&pdev->vpci->lock) ) + if ( !spin_trylock(&pdev->vpci_lock) ) return -EBUSY; - if ( pdev->vpci->msix != msix ) + if ( !pdev->vpci || pdev->vpci->msix != msix ) { - spin_unlock(&pdev->vpci->lock); + spin_unlock(&pdev->vpci_lock); return -EAGAIN; } } diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c index e8b09d77d880..50dec3bb73d0 100644 --- a/xen/drivers/passthrough/pci.c +++ b/xen/drivers/passthrough/pci.c @@ -397,6 +397,7 @@ static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn) *((u8*) &pdev->bus) = bus; *((u8*) &pdev->devfn) = devfn; pdev->domain = NULL; + spin_lock_init(&pdev->vpci_lock); arch_pci_init_pdev(pdev); diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c index 40ff79c33f8f..bd23c0274d48 100644 --- a/xen/drivers/vpci/header.c +++ b/xen/drivers/vpci/header.c @@ -142,12 +142,13 @@ bool vpci_process_pending(struct vcpu *v) if ( rc == -ERESTART ) return true; - spin_lock(&v->vpci.pdev->vpci->lock); - /* Disable memory decoding unconditionally on failure. */ - modify_decoding(v->vpci.pdev, - rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd, - !rc && v->vpci.rom_only); - spin_unlock(&v->vpci.pdev->vpci->lock); + spin_lock(&v->vpci.pdev->vpci_lock); + if ( v->vpci.pdev->vpci ) + /* Disable memory decoding unconditionally on failure. */ + modify_decoding(v->vpci.pdev, + rc ? 
v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd, + !rc && v->vpci.rom_only); + spin_unlock(&v->vpci.pdev->vpci_lock); rangeset_destroy(v->vpci.mem); v->vpci.mem = NULL; @@ -285,6 +286,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) continue; } + spin_lock(&tmp->vpci_lock); + if ( !tmp->vpci ) + { + spin_unlock(&tmp->vpci_lock); + continue; + } for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ ) { const struct vpci_bar *bar = &tmp->vpci->header.bars[i]; @@ -303,12 +310,14 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) rc = rangeset_remove_range(mem, start, end); if ( rc ) { + spin_unlock(&tmp->vpci_lock); printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n", start, end, rc); rangeset_destroy(mem); return rc; } } + spin_unlock(&tmp->vpci_lock); } ASSERT(dev); diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c index 5757a7aed20f..e3ce46869dad 100644 --- a/xen/drivers/vpci/msi.c +++ b/xen/drivers/vpci/msi.c @@ -270,7 +270,7 @@ void vpci_dump_msi(void) rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) { - const struct pci_dev *pdev; + struct pci_dev *pdev; if ( !has_vpci(d) ) continue; @@ -282,8 +282,13 @@ void vpci_dump_msi(void) const struct vpci_msi *msi; const struct vpci_msix *msix; - if ( !pdev->vpci || !spin_trylock(&pdev->vpci->lock) ) + if ( !spin_trylock(&pdev->vpci_lock) ) continue; + if ( !pdev->vpci ) + { + spin_unlock(&pdev->vpci_lock); + continue; + } msi = pdev->vpci->msi; if ( msi && msi->enabled ) @@ -323,7 +328,7 @@ void vpci_dump_msi(void) } } - spin_unlock(&pdev->vpci->lock); + spin_unlock(&pdev->vpci_lock); process_pending_softirqs(); } } diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c index 846f1b8d7038..d1dbfc6e0ffd 100644 --- a/xen/drivers/vpci/msix.c +++ b/xen/drivers/vpci/msix.c @@ -138,7 +138,7 @@ static void control_write(const struct pci_dev *pdev, unsigned int reg, pci_conf_write16(pdev->sbdf, reg, val); } -static struct 
vpci_msix *msix_find(const struct domain *d, unsigned long addr) +static struct vpci_msix *msix_get(const struct domain *d, unsigned long addr) { struct vpci_msix *msix; @@ -150,15 +150,29 @@ static struct vpci_msix *msix_find(const struct domain *d, unsigned long addr) for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ ) if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled && VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) ) + { + spin_lock(&msix->pdev->vpci_lock); return msix; + } } return NULL; } +static void msix_put(struct vpci_msix *msix) +{ + if ( !msix ) + return; + + spin_unlock(&msix->pdev->vpci_lock); +} + static int msix_accept(struct vcpu *v, unsigned long addr) { - return !!msix_find(v->domain, addr); + struct vpci_msix *msix = msix_get(v->domain, addr); + + msix_put(msix); + return !!msix; } static bool access_allowed(const struct pci_dev *pdev, unsigned long addr, @@ -186,7 +200,7 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, unsigned long *data) { const struct domain *d = v->domain; - struct vpci_msix *msix = msix_find(d, addr); + struct vpci_msix *msix = msix_get(d, addr); const struct vpci_msix_entry *entry; unsigned int offset; @@ -196,7 +210,10 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, return X86EMUL_RETRY; if ( !access_allowed(msix->pdev, addr, len) ) + { + msix_put(msix); return X86EMUL_OKAY; + } if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) ) { @@ -222,10 +239,10 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, break; } + msix_put(msix); return X86EMUL_OKAY; } - spin_lock(&msix->pdev->vpci->lock); entry = get_entry(msix, addr); offset = addr & (PCI_MSIX_ENTRY_SIZE - 1); @@ -254,7 +271,8 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, ASSERT_UNREACHABLE(); break; } - spin_unlock(&msix->pdev->vpci->lock); + + msix_put(msix); return X86EMUL_OKAY; } @@ -263,7 +281,7 @@ static int msix_write(struct vcpu *v, 
unsigned long addr, unsigned int len, unsigned long data) { const struct domain *d = v->domain; - struct vpci_msix *msix = msix_find(d, addr); + struct vpci_msix *msix = msix_get(d, addr); struct vpci_msix_entry *entry; unsigned int offset; @@ -271,7 +289,10 @@ static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len, return X86EMUL_RETRY; if ( !access_allowed(msix->pdev, addr, len) ) + { + msix_put(msix); return X86EMUL_OKAY; + } if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) ) { @@ -294,10 +315,11 @@ static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len, } } + msix_put(msix); + return X86EMUL_OKAY; } - spin_lock(&msix->pdev->vpci->lock); entry = get_entry(msix, addr); offset = addr & (PCI_MSIX_ENTRY_SIZE - 1); @@ -370,7 +392,8 @@ static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len, ASSERT_UNREACHABLE(); break; } - spin_unlock(&msix->pdev->vpci->lock); + + msix_put(msix); return X86EMUL_OKAY; } diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c index fb0947179b79..cb2ababa28e3 100644 --- a/xen/drivers/vpci/vpci.c +++ b/xen/drivers/vpci/vpci.c @@ -35,12 +35,10 @@ extern vpci_register_init_t *const __start_vpci_array[]; extern vpci_register_init_t *const __end_vpci_array[]; #define NUM_VPCI_INIT (__end_vpci_array - __start_vpci_array) -void vpci_remove_device(struct pci_dev *pdev) +static void vpci_remove_device_locked(struct pci_dev *pdev) { - if ( !has_vpci(pdev->domain) ) - return; + ASSERT(spin_is_locked(&pdev->vpci_lock)); - spin_lock(&pdev->vpci->lock); while ( !list_empty(&pdev->vpci->handlers) ) { struct vpci_register *r = list_first_entry(&pdev->vpci->handlers, @@ -50,15 +48,26 @@ void vpci_remove_device(struct pci_dev *pdev) list_del(&r->node); xfree(r); } - spin_unlock(&pdev->vpci->lock); xfree(pdev->vpci->msix); xfree(pdev->vpci->msi); xfree(pdev->vpci); pdev->vpci = NULL; } +void vpci_remove_device(struct pci_dev *pdev) +{ + if ( !has_vpci(pdev->domain) ) + 
return; + + spin_lock(&pdev->vpci_lock); + if ( pdev->vpci ) + vpci_remove_device_locked(pdev); + spin_unlock(&pdev->vpci_lock); +} + int vpci_add_handlers(struct pci_dev *pdev) { + struct vpci *vpci; unsigned int i; int rc = 0; @@ -68,12 +77,14 @@ int vpci_add_handlers(struct pci_dev *pdev) /* We should not get here twice for the same device. */ ASSERT(!pdev->vpci); - pdev->vpci = xzalloc(struct vpci); - if ( !pdev->vpci ) + vpci = xzalloc(struct vpci); + if ( !vpci ) return -ENOMEM; - INIT_LIST_HEAD(&pdev->vpci->handlers); - spin_lock_init(&pdev->vpci->lock); + INIT_LIST_HEAD(&vpci->handlers); + + spin_lock(&pdev->vpci_lock); + pdev->vpci = vpci; for ( i = 0; i < NUM_VPCI_INIT; i++ ) { @@ -83,7 +94,8 @@ int vpci_add_handlers(struct pci_dev *pdev) } if ( rc ) - vpci_remove_device(pdev); + vpci_remove_device_locked(pdev); + spin_unlock(&pdev->vpci_lock); return rc; } @@ -129,6 +141,7 @@ uint32_t vpci_hw_read32(const struct pci_dev *pdev, unsigned int reg, return pci_conf_read32(pdev->sbdf, reg); } +/* Must be called with pdev->vpci_lock held. */ int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, vpci_write_t *write_handler, unsigned int offset, unsigned int size, void *data) @@ -152,8 +165,6 @@ int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, r->offset = offset; r->private = data; - spin_lock(&vpci->lock); - /* The list of handlers must be kept sorted at all times. */ list_for_each ( prev, &vpci->handlers ) { @@ -165,25 +176,23 @@ int vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, break; if ( cmp == 0 ) { - spin_unlock(&vpci->lock); xfree(r); return -EEXIST; } } list_add_tail(&r->node, prev); - spin_unlock(&vpci->lock); return 0; } +/* Must be called with pdev->vpci_lock held. 
*/ int vpci_remove_register(struct vpci *vpci, unsigned int offset, unsigned int size) { const struct vpci_register r = { .offset = offset, .size = size }; struct vpci_register *rm; - spin_lock(&vpci->lock); list_for_each_entry ( rm, &vpci->handlers, node ) { int cmp = vpci_register_cmp(&r, rm); @@ -195,14 +204,12 @@ int vpci_remove_register(struct vpci *vpci, unsigned int offset, if ( !cmp && rm->offset == offset && rm->size == size ) { list_del(&rm->node); - spin_unlock(&vpci->lock); xfree(rm); return 0; } if ( cmp <= 0 ) break; } - spin_unlock(&vpci->lock); return -ENOENT; } @@ -311,7 +318,7 @@ static uint32_t merge_result(uint32_t data, uint32_t new, unsigned int size, uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size) { const struct domain *d = current->domain; - const struct pci_dev *pdev; + struct pci_dev *pdev; const struct vpci_register *r; unsigned int data_offset = 0; uint32_t data = ~(uint32_t)0; @@ -327,7 +334,12 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size) if ( !pdev ) return vpci_read_hw(sbdf, reg, size); - spin_lock(&pdev->vpci->lock); + spin_lock(&pdev->vpci_lock); + if ( !pdev->vpci ) + { + spin_unlock(&pdev->vpci_lock); + return vpci_read_hw(sbdf, reg, size); + } /* Read from the hardware or the emulated register handlers. 
*/ list_for_each_entry ( r, &pdev->vpci->handlers, node ) @@ -370,7 +382,7 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size) break; ASSERT(data_offset < size); } - spin_unlock(&pdev->vpci->lock); + spin_unlock(&pdev->vpci_lock); if ( data_offset < size ) { @@ -414,7 +426,7 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, uint32_t data) { const struct domain *d = current->domain; - const struct pci_dev *pdev; + struct pci_dev *pdev; const struct vpci_register *r; unsigned int data_offset = 0; const unsigned long *ro_map = pci_get_ro_map(sbdf.seg); @@ -440,7 +452,14 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, return; } - spin_lock(&pdev->vpci->lock); + spin_lock(&pdev->vpci_lock); + if ( !pdev->vpci ) + { + spin_unlock(&pdev->vpci_lock); + vpci_write_hw(sbdf, reg, size, data); + return; + } + /* Write the value to the hardware or emulated registers. */ list_for_each_entry ( r, &pdev->vpci->handlers, node ) @@ -475,7 +494,7 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, break; ASSERT(data_offset < size); } - spin_unlock(&pdev->vpci->lock); + spin_unlock(&pdev->vpci_lock); if ( data_offset < size ) /* Tailing gap, write the remaining. */ diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h index b6d7e454f814..3f60d6c6c6dd 100644 --- a/xen/include/xen/pci.h +++ b/xen/include/xen/pci.h @@ -134,6 +134,7 @@ struct pci_dev { u64 vf_rlen[6]; /* Data for vPCI. */ + spinlock_t vpci_lock; struct vpci *vpci; }; diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h index e8ac1eb39513..f2a7d82ce77b 100644 --- a/xen/include/xen/vpci.h +++ b/xen/include/xen/vpci.h @@ -31,7 +31,7 @@ int __must_check vpci_add_handlers(struct pci_dev *dev); /* Remove all handlers and free vpci related structures. */ void vpci_remove_device(struct pci_dev *pdev); -/* Add/remove a register handler. */ +/* Add/remove a register handler. Must be called holding the vpci_lock. 
*/ int __must_check vpci_add_register(struct vpci *vpci, vpci_read_t *read_handler, vpci_write_t *write_handler, @@ -60,7 +60,6 @@ bool __must_check vpci_process_pending(struct vcpu *v); struct vpci { /* List of vPCI handlers for a device. */ struct list_head handlers; - spinlock_t lock; #ifdef __XEN__ /* Hide the rest of the vpci struct from the user-space test harness. */