Message ID | 1488159949-15011-6-git-send-email-chao.gao@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Mar 02, 2017 at 01:58:14AM -0700, Jan Beulich wrote: >>>> On 27.02.17 at 02:45, <chao.gao@intel.com> wrote: >> @@ -547,16 +548,116 @@ static int remap_entry_to_msi_msg( >> return 0; >> } >> >> +/* >> + * This function is a common interface to update irte for msi case. >> + * >> + * If @pi_desc != NULL and @gvec != 0, the IRTE will be updated to a posted >> + * format. In this case, @msg is ignored because constructing a posted format >> + * IRTE doesn't need any information about the msi address or msi data. >> + * >> + * If @pi_desc == NULL and @gvec == 0, the IRTE will be updated to a remapped >> + * format. In this case, @msg can't be NULL. > >This kind of implies that in the other case msg can be NULL. Please >make this explicit or remove the last sentence, to avoid confusing >readers. Plus, if msg can be NULL in that case, why don't you pass >NULL in that case? I choose to make this explicit. > >> + * Assume 'ir_ctrl->iremap_lock' has been acquired and the remap_index >> + * of msi_desc has a benign value. >> + */ >> +static int update_irte_for_msi_common( >> + struct iommu *iommu, const struct pci_dev *pdev, >> + const struct msi_desc *msi_desc, struct msi_msg *msg, >> + const struct pi_desc *pi_desc, const uint8_t gvec) >> +{ >> + struct iremap_entry *iremap_entry = NULL, *iremap_entries; >> + struct iremap_entry new_ire = {{0}}; > >I think just "{ }" will do. > >> + unsigned int index = msi_desc->remap_index; >> + struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); >> + >> + ASSERT( ir_ctrl ); >> + ASSERT( spin_is_locked(&ir_ctrl->iremap_lock) ); >> + ASSERT( (index >= 0) && (index < IREMAP_ENTRY_NR) ); > >Stray blanks inside parentheses. > >> + if ( (!pi_desc && gvec) || (pi_desc && !gvec) ) > >gvec == 0 alone is never a valid check: Either all vectors are valid, >or a whole range (e.g. 0x00...0x0f) is invalid. Furthermore I think >such checks are easier to read as either How about only use pi_desc is NULL or not to decide the format of the IRTE? > > if ( !pi_desc != !gvec ) > >or > > if ( pi_desc ? !gvec : gvec ) > >> + return -EINVAL; >> + >> + if ( !pi_desc && !gvec && !msg ) > >With the earlier check the first or second part could be omitted >afaict. Agree > >> + return -EINVAL; >> + >> + GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, >> + iremap_entries, iremap_entry); >> + >> + if ( !pi_desc ) >> + { >> + /* Set interrupt remapping table entry */ > >Again a request for consistency: Either have a respective comment >also at the top of the else branch, or omit the one here too. > >> + new_ire.remap.dm = msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT; >> + new_ire.remap.tm = msg->data >> MSI_DATA_TRIGGER_SHIFT; >> + new_ire.remap.dlm = msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT; >> + /* Hardware require RH = 1 for LPR delivery mode */ >> + new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio); >> + new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & >> + MSI_DATA_VECTOR_MASK; >> + if ( x2apic_enabled ) >> + new_ire.remap.dst = msg->dest32; >> + else >> + new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) >> + & 0xff) << 8; > >Please strive to eliminate literal numbers here. At least the 0xff looks >to be easy to deal with (using MASK_EXTR() together with >MSI_ADDR_DEST_ID_MASK). > >> + new_ire.remap.p = 1; >> + } >> + else >> + { >> + new_ire.post.im = 1; >> + new_ire.post.vector = gvec; >> + new_ire.post.pda_l = virt_to_maddr(pi_desc) >> (32 - PDA_LOW_BIT); >> + new_ire.post.pda_h = virt_to_maddr(pi_desc) >> 32; >> + new_ire.post.p = 1; >> + } >> + >> + if ( pdev ) >> + set_msi_source_id(pdev, &new_ire); >> + else >> + set_hpet_source_id(msi_desc->hpet_id, &new_ire); >> + >> + if ( iremap_entry->val != new_ire.val ) >> + { >> + if ( cpu_has_cx16 ) >> + { >> + __uint128_t ret; >> + struct iremap_entry old_ire; >> + >> + old_ire = *iremap_entry; >> + ret = cmpxchg16b(iremap_entry, &old_ire, &new_ire); >> + >> + /* >> + * In the above, we use cmpxchg16 to atomically update the 128-bit >> + * IRTE, and the hardware cannot update the IRTE behind us, so >> + * the return value of cmpxchg16 should be the same as old_ire. >> + * This ASSERT validate it. >> + */ >> + ASSERT(ret == old_ire.val); >> + } >> + else >> + { > >This wants a comment added explaining the conditions under which >this is safe. Perhaps also one or more ASSERT()s to that effect. Yes, will add an explaination and ASSERT(). > >> + iremap_entry->lo = new_ire.lo; >> + iremap_entry->hi = new_ire.hi; >> + } >> + >> + iommu_flush_cache_entry(iremap_entry, sizeof(struct iremap_entry)); > >sizeof(*iremap_entry) > >> @@ -592,38 +693,33 @@ static int msi_msg_to_remap_entry( >> return -EFAULT; >> } >> >> + /* Get the IRTE's bind relationship with guest from the live IRTE. */ >> GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, >> iremap_entries, iremap_entry); >> - >> - memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry)); >> - >> - /* Set interrupt remapping table entry */ >> - new_ire.remap.fpd = 0; >> - new_ire.remap.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1; >> - new_ire.remap.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; >> - new_ire.remap.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1; >> - /* Hardware require RH = 1 for LPR delivery mode */ >> - new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio); >> - new_ire.remap.avail = 0; >> - new_ire.remap.res_1 = 0; >> - new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & >> - MSI_DATA_VECTOR_MASK; >> - new_ire.remap.res_2 = 0; >> - if ( x2apic_enabled ) >> - new_ire.remap.dst = msg->dest32; >> + if ( !iremap_entry->remap.im ) >> + { >> + gvec = 0; >> + pi_desc = NULL; >> + } >> else >> - new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) >> - & 0xff) << 8; >> + { >> + gvec = iremap_entry->post.vector; >> + pi_desc = (void *)((((u64)iremap_entry->post.pda_h) << PDA_LOW_BIT ) >> + + iremap_entry->post.pda_l); >> + } >> + unmap_vtd_domain_page(iremap_entries); > >I don't follow: Why does it matter what the entry currently holds? >As I've pointed out more than once before (mainly to Feng), the >goal ought to be to produce the new entry solely based on what >the intended new state is, i.e. function input and global data. > I think the function introduced by this patch is to produce the new entry solely based on input. If someone wants to produce the new entry, it can call it directly. I want to explain why we read the entry. msi_msg_to_remap_entry() can be called before a msi gets bound to a guest interrupt or after that. If we call the function without realizing the msi has been binded to a guest interrupt, the IRTE would be updated to a remapped format breaking the binding (at least breaking the intention to use VT-d PI). I think this is a possible case in the current code. This patch avoids this case and provides a new function to the callers who are intended to replace a posted format IRTE with a remapped format IRTE. Reading this entry is to get the binding information and use it to update IRTE ( as comments in code, when the IRTE is in posted format, we can suppress the update since the content of IRTE will not change for the binding information hasn't change. and Also if the binding information changed, we should call pi_update_irte ). At this moment, we don't recognize any existing caller of msi_msg_to_remap_entry() needs to update a posted IRTE to a remapped IRTE. If the need emerges, we can expose the common function to the callers. I also want to extend pi_update_irte to replace a posted IRTE to a remapped one,when guest wrongly configurate its msi interrupt. I hope I have made it a little clear. and glad to see your further suggestion to make the code and the description better to enlighten the following readers. >> - if ( pdev ) >> - set_msi_source_id(pdev, &new_ire); >> - else >> - set_hpet_source_id(msi_desc->hpet_id, &new_ire); >> - new_ire.remap.res_3 = 0; >> - new_ire.remap.res_4 = 0; >> - new_ire.remap.p = 1; /* finally, set present bit */ >> + /* >> + * Actually we can just suppress the update when IRTE is already in posted >> + * format. After a msi gets bound to a guest interrupt, changes to the msi >> + * message have no effect to the IRTE. >> + */ >> + update_irte_for_msi_common(iommu, pdev, msi_desc, msg, pi_desc, gvec); >> >> /* now construct new MSI/MSI-X rte entry */ >> + if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) >> + nr = msi_desc->msi.nvec; > >Why do you re-do here what was already done earlier in the function >(code you didn't touch)? > Will remove this. >> @@ -996,31 +1046,11 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq, >> if ( !ir_ctrl ) >> return -ENODEV; >> >> - spin_lock_irq(&ir_ctrl->iremap_lock); >> - >> - GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, remap_index, iremap_entries, p); >> - >> - old_ire = *p; >> - >> - /* Setup/Update interrupt remapping table entry. */ >> - setup_posted_irte(&new_ire, &old_ire, pi_desc, gvec); >> - ret = cmpxchg16b(p, &old_ire, &new_ire); >> - >> - /* >> - * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE, >> - * and the hardware cannot update the IRTE behind us, so the return value >> - * of cmpxchg16 should be the same as old_ire. This ASSERT validate it. >> - */ >> - ASSERT(ret == old_ire.val); >> - >> - iommu_flush_cache_entry(p, sizeof(*p)); >> - iommu_flush_iec_index(iommu, 0, remap_index); >> - >> - unmap_vtd_domain_page(iremap_entries); >> - >> - spin_unlock_irq(&ir_ctrl->iremap_lock); >> - >> - return 0; >> + spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); >> + rc = update_irte_for_msi_common(iommu, pci_dev, msi_desc, NULL, pi_desc, >> + gvec); >> + spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); >> + return rc; > >Considering the old code use spin_lock_irq() (and there's such left >also earlier in the function), why do you use the irqsave function >here? Yes, it should be spin_lock_irq(). I saw it used in msi_msg_to_remap_entry() so I followed it without digging deep into the difference. The spin_lock_irq() makes an assumption to the current context, so it's better. Thanks, Chao > >Jan
>>> On 27.02.17 at 02:45, <chao.gao@intel.com> wrote: > @@ -547,16 +548,116 @@ static int remap_entry_to_msi_msg( > return 0; > } > > +/* > + * This function is a common interface to update irte for msi case. > + * > + * If @pi_desc != NULL and @gvec != 0, the IRTE will be updated to a posted > + * format. In this case, @msg is ignored because constructing a posted format > + * IRTE doesn't need any information about the msi address or msi data. > + * > + * If @pi_desc == NULL and @gvec == 0, the IRTE will be updated to a remapped > + * format. In this case, @msg can't be NULL. This kind of implies that in the other case msg can be NULL. Please make this explicit or remove the last sentence, to avoid confusing readers. Plus, if msg can be NULL in that case, why don't you pass NULL in that case? > + * Assume 'ir_ctrl->iremap_lock' has been acquired and the remap_index > + * of msi_desc has a benign value. > + */ > +static int update_irte_for_msi_common( > + struct iommu *iommu, const struct pci_dev *pdev, > + const struct msi_desc *msi_desc, struct msi_msg *msg, > + const struct pi_desc *pi_desc, const uint8_t gvec) > +{ > + struct iremap_entry *iremap_entry = NULL, *iremap_entries; > + struct iremap_entry new_ire = {{0}}; I think just "{ }" will do. > + unsigned int index = msi_desc->remap_index; > + struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); > + > + ASSERT( ir_ctrl ); > + ASSERT( spin_is_locked(&ir_ctrl->iremap_lock) ); > + ASSERT( (index >= 0) && (index < IREMAP_ENTRY_NR) ); Stray blanks inside parentheses. > + if ( (!pi_desc && gvec) || (pi_desc && !gvec) ) gvec == 0 alone is never a valid check: Either all vectors are valid, or a whole range (e.g. 0x00...0x0f) is invalid. Furthermore I think such checks are easier to read as either if ( !pi_desc != !gvec ) or if ( pi_desc ? !gvec : gvec ) > + return -EINVAL; > + > + if ( !pi_desc && !gvec && !msg ) With the earlier check the first or second part could be omitted afaict. > + return -EINVAL; > + > + GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, > + iremap_entries, iremap_entry); > + > + if ( !pi_desc ) > + { > + /* Set interrupt remapping table entry */ Again a request for consistency: Either have a respective comment also at the top of the else branch, or omit the one here too. > + new_ire.remap.dm = msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT; > + new_ire.remap.tm = msg->data >> MSI_DATA_TRIGGER_SHIFT; > + new_ire.remap.dlm = msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT; > + /* Hardware require RH = 1 for LPR delivery mode */ > + new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio); > + new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & > + MSI_DATA_VECTOR_MASK; > + if ( x2apic_enabled ) > + new_ire.remap.dst = msg->dest32; > + else > + new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) > + & 0xff) << 8; Please strive to eliminate literal numbers here. At least the 0xff looks to be easy to deal with (using MASK_EXTR() together with MSI_ADDR_DEST_ID_MASK). > + new_ire.remap.p = 1; > + } > + else > + { > + new_ire.post.im = 1; > + new_ire.post.vector = gvec; > + new_ire.post.pda_l = virt_to_maddr(pi_desc) >> (32 - PDA_LOW_BIT); > + new_ire.post.pda_h = virt_to_maddr(pi_desc) >> 32; > + new_ire.post.p = 1; > + } > + > + if ( pdev ) > + set_msi_source_id(pdev, &new_ire); > + else > + set_hpet_source_id(msi_desc->hpet_id, &new_ire); > + > + if ( iremap_entry->val != new_ire.val ) > + { > + if ( cpu_has_cx16 ) > + { > + __uint128_t ret; > + struct iremap_entry old_ire; > + > + old_ire = *iremap_entry; > + ret = cmpxchg16b(iremap_entry, &old_ire, &new_ire); > + > + /* > + * In the above, we use cmpxchg16 to atomically update the 128-bit > + * IRTE, and the hardware cannot update the IRTE behind us, so > + * the return value of cmpxchg16 should be the same as old_ire. > + * This ASSERT validate it. > + */ > + ASSERT(ret == old_ire.val); > + } > + else > + { This wants a comment added explaining the conditions under which this is safe. Perhaps also one or more ASSERT()s to that effect. > + iremap_entry->lo = new_ire.lo; > + iremap_entry->hi = new_ire.hi; > + } > + > + iommu_flush_cache_entry(iremap_entry, sizeof(struct iremap_entry)); sizeof(*iremap_entry) > @@ -592,38 +693,33 @@ static int msi_msg_to_remap_entry( > return -EFAULT; > } > > + /* Get the IRTE's bind relationship with guest from the live IRTE. */ > GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, > iremap_entries, iremap_entry); > - > - memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry)); > - > - /* Set interrupt remapping table entry */ > - new_ire.remap.fpd = 0; > - new_ire.remap.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1; > - new_ire.remap.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; > - new_ire.remap.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1; > - /* Hardware require RH = 1 for LPR delivery mode */ > - new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio); > - new_ire.remap.avail = 0; > - new_ire.remap.res_1 = 0; > - new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & > - MSI_DATA_VECTOR_MASK; > - new_ire.remap.res_2 = 0; > - if ( x2apic_enabled ) > - new_ire.remap.dst = msg->dest32; > + if ( !iremap_entry->remap.im ) > + { > + gvec = 0; > + pi_desc = NULL; > + } > else > - new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) > - & 0xff) << 8; > + { > + gvec = iremap_entry->post.vector; > + pi_desc = (void *)((((u64)iremap_entry->post.pda_h) << PDA_LOW_BIT ) > + + iremap_entry->post.pda_l); > + } > + unmap_vtd_domain_page(iremap_entries); I don't follow: Why does it matter what the entry currently holds? As I've pointed out more than once before (mainly to Feng), the goal ought to be to produce the new entry solely based on what the intended new state is, i.e. function input and global data. > - if ( pdev ) > - set_msi_source_id(pdev, &new_ire); > - else > - set_hpet_source_id(msi_desc->hpet_id, &new_ire); > - new_ire.remap.res_3 = 0; > - new_ire.remap.res_4 = 0; > - new_ire.remap.p = 1; /* finally, set present bit */ > + /* > + * Actually we can just suppress the update when IRTE is already in posted > + * format. After a msi gets bound to a guest interrupt, changes to the msi > + * message have no effect to the IRTE. > + */ > + update_irte_for_msi_common(iommu, pdev, msi_desc, msg, pi_desc, gvec); > > /* now construct new MSI/MSI-X rte entry */ > + if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) > + nr = msi_desc->msi.nvec; Why do you re-do here what was already done earlier in the function (code you didn't touch)? > @@ -996,31 +1046,11 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq, > if ( !ir_ctrl ) > return -ENODEV; > > - spin_lock_irq(&ir_ctrl->iremap_lock); > - > - GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, remap_index, iremap_entries, p); > - > - old_ire = *p; > - > - /* Setup/Update interrupt remapping table entry. */ > - setup_posted_irte(&new_ire, &old_ire, pi_desc, gvec); > - ret = cmpxchg16b(p, &old_ire, &new_ire); > - > - /* > - * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE, > - * and the hardware cannot update the IRTE behind us, so the return value > - * of cmpxchg16 should be the same as old_ire. This ASSERT validate it. > - */ > - ASSERT(ret == old_ire.val); > - > - iommu_flush_cache_entry(p, sizeof(*p)); > - iommu_flush_iec_index(iommu, 0, remap_index); > - > - unmap_vtd_domain_page(iremap_entries); > - > - spin_unlock_irq(&ir_ctrl->iremap_lock); > - > - return 0; > + spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); > + rc = update_irte_for_msi_common(iommu, pci_dev, msi_desc, NULL, pi_desc, > + gvec); > + spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); > + return rc; Considering the old code use spin_lock_irq() (and there's such left also earlier in the function), why do you use the irqsave function here? Jan
>>> On 02.03.17 at 08:14, <chao.gao@intel.com> wrote: > On Thu, Mar 02, 2017 at 01:58:14AM -0700, Jan Beulich wrote: >>>>> On 27.02.17 at 02:45, <chao.gao@intel.com> wrote: >>> + if ( (!pi_desc && gvec) || (pi_desc && !gvec) ) >> >>gvec == 0 alone is never a valid check: Either all vectors are valid, >>or a whole range (e.g. 0x00...0x0f) is invalid. Furthermore I think >>such checks are easier to read as either > > How about only use pi_desc is NULL or not to decide the format of the IRTE? That makes sense, I think. >>> @@ -592,38 +693,33 @@ static int msi_msg_to_remap_entry( >>> return -EFAULT; >>> } >>> >>> + /* Get the IRTE's bind relationship with guest from the live IRTE. */ >>> GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, >>> iremap_entries, iremap_entry); >>> - >>> - memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry)); >>> - >>> - /* Set interrupt remapping table entry */ >>> - new_ire.remap.fpd = 0; >>> - new_ire.remap.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1; >>> - new_ire.remap.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; >>> - new_ire.remap.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1; >>> - /* Hardware require RH = 1 for LPR delivery mode */ >>> - new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio); >>> - new_ire.remap.avail = 0; >>> - new_ire.remap.res_1 = 0; >>> - new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & >>> - MSI_DATA_VECTOR_MASK; >>> - new_ire.remap.res_2 = 0; >>> - if ( x2apic_enabled ) >>> - new_ire.remap.dst = msg->dest32; >>> + if ( !iremap_entry->remap.im ) >>> + { >>> + gvec = 0; >>> + pi_desc = NULL; >>> + } >>> else >>> - new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) >>> - & 0xff) << 8; >>> + { >>> + gvec = iremap_entry->post.vector; >>> + pi_desc = (void *)((((u64)iremap_entry->post.pda_h) << PDA_LOW_BIT ) >>> + + iremap_entry->post.pda_l); >>> + } >>> + unmap_vtd_domain_page(iremap_entries); >> >>I don't follow: Why does it matter what the entry currently holds? >>As I've pointed out more than once before (mainly to Feng), the >>goal ought to be to produce the new entry solely based on what >>the intended new state is, i.e. function input and global data. >> > > I think the function introduced by this patch is to produce the new > entry solely based on input. Which contradicts you reading the table entry. > If someone wants to produce the new entry, > it can call it directly. > > I want to explain why we read the entry. > > msi_msg_to_remap_entry() can be called before a msi gets bound to a guest > interrupt or after that. If we call the function without realizing the msi > has been binded to a guest interrupt, the IRTE would be updated to a > remapped format breaking the binding (at least breaking the intention to use > VT-d PI). I think this is a possible case in the current code. This patch avoids > this case and provides a new function to the callers who are intended to replace > a posted format IRTE with a remapped format IRTE. Reading this entry is to get > the binding information and use it to update IRTE ( as comments in code, when > the IRTE is in posted format, we can suppress the update since the content > of IRTE will not change for the binding information hasn't change. and Also > if the binding information changed, we should call pi_update_irte ). Just to repeat what I've said a number of times before: Preferably the new entry should be created from scratch, and all data needed to do so should be passed in (or be derivable from passed in data, e.g. by looking up global data structures _other than the IRT_ ). > At this moment, we don't recognize any existing caller of > msi_msg_to_remap_entry() needs to update a posted IRTE to a remapped IRTE. > If the need emerges, we can expose the common function to the callers. This restriction would go away with the function being fully self contained, as re-explained above. The second best option (if the preferred one is overly complicated to implement) is to make the function actively refuse anything it can't currently handle. I don't think that's the case yet with this version of the patch. >>> @@ -996,31 +1046,11 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq, >>> if ( !ir_ctrl ) >>> return -ENODEV; >>> >>> - spin_lock_irq(&ir_ctrl->iremap_lock); >>> - >>> - GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, remap_index, iremap_entries, p); >>> - >>> - old_ire = *p; >>> - >>> - /* Setup/Update interrupt remapping table entry. */ >>> - setup_posted_irte(&new_ire, &old_ire, pi_desc, gvec); >>> - ret = cmpxchg16b(p, &old_ire, &new_ire); >>> - >>> - /* >>> - * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE, >>> - * and the hardware cannot update the IRTE behind us, so the return value >>> - * of cmpxchg16 should be the same as old_ire. This ASSERT validate it. >>> - */ >>> - ASSERT(ret == old_ire.val); >>> - >>> - iommu_flush_cache_entry(p, sizeof(*p)); >>> - iommu_flush_iec_index(iommu, 0, remap_index); >>> - >>> - unmap_vtd_domain_page(iremap_entries); >>> - >>> - spin_unlock_irq(&ir_ctrl->iremap_lock); >>> - >>> - return 0; >>> + spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); >>> + rc = update_irte_for_msi_common(iommu, pci_dev, msi_desc, NULL, pi_desc, >>> + gvec); >>> + spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); >>> + return rc; >> >>Considering the old code use spin_lock_irq() (and there's such left >>also earlier in the function), why do you use the irqsave function >>here? > > Yes, it should be spin_lock_irq(). I saw it used in msi_msg_to_remap_entry() so > I followed it without digging deep into the difference. The spin_lock_irq() > makes an assumption to the current context, so it's better. It's not necessarily better (or worse), but it's consistent with neighboring code. Jan
diff --git a/xen/drivers/passthrough/vtd/intremap.c b/xen/drivers/passthrough/vtd/intremap.c index bfd468b..4269cd4 100644 --- a/xen/drivers/passthrough/vtd/intremap.c +++ b/xen/drivers/passthrough/vtd/intremap.c @@ -420,7 +420,8 @@ void io_apic_write_remap_rte( __ioapic_write_entry(apic, ioapic_pin, 1, old_rte); } -static void set_msi_source_id(struct pci_dev *pdev, struct iremap_entry *ire) +static void set_msi_source_id(const struct pci_dev *pdev, + struct iremap_entry *ire) { u16 seg; u8 bus, devfn, secbus; @@ -547,16 +548,116 @@ static int remap_entry_to_msi_msg( return 0; } +/* + * This function is a common interface to update irte for msi case. + * + * If @pi_desc != NULL and @gvec != 0, the IRTE will be updated to a posted + * format. In this case, @msg is ignored because constructing a posted format + * IRTE doesn't need any information about the msi address or msi data. + * + * If @pi_desc == NULL and @gvec == 0, the IRTE will be updated to a remapped + * format. In this case, @msg can't be NULL. + * + * Assume 'ir_ctrl->iremap_lock' has been acquired and the remap_index + * of msi_desc has a benign value. + */ +static int update_irte_for_msi_common( + struct iommu *iommu, const struct pci_dev *pdev, + const struct msi_desc *msi_desc, struct msi_msg *msg, + const struct pi_desc *pi_desc, const uint8_t gvec) +{ + struct iremap_entry *iremap_entry = NULL, *iremap_entries; + struct iremap_entry new_ire = {{0}}; + unsigned int index = msi_desc->remap_index; + struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); + + ASSERT( ir_ctrl ); + ASSERT( spin_is_locked(&ir_ctrl->iremap_lock) ); + ASSERT( (index >= 0) && (index < IREMAP_ENTRY_NR) ); + + if ( (!pi_desc && gvec) || (pi_desc && !gvec) ) + return -EINVAL; + + if ( !pi_desc && !gvec && !msg ) + return -EINVAL; + + GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, + iremap_entries, iremap_entry); + + if ( !pi_desc ) + { + /* Set interrupt remapping table entry */ + new_ire.remap.dm = msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT; + new_ire.remap.tm = msg->data >> MSI_DATA_TRIGGER_SHIFT; + new_ire.remap.dlm = msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT; + /* Hardware require RH = 1 for LPR delivery mode */ + new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio); + new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & + MSI_DATA_VECTOR_MASK; + if ( x2apic_enabled ) + new_ire.remap.dst = msg->dest32; + else + new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) + & 0xff) << 8; + new_ire.remap.p = 1; + } + else + { + new_ire.post.im = 1; + new_ire.post.vector = gvec; + new_ire.post.pda_l = virt_to_maddr(pi_desc) >> (32 - PDA_LOW_BIT); + new_ire.post.pda_h = virt_to_maddr(pi_desc) >> 32; + new_ire.post.p = 1; + } + + if ( pdev ) + set_msi_source_id(pdev, &new_ire); + else + set_hpet_source_id(msi_desc->hpet_id, &new_ire); + + if ( iremap_entry->val != new_ire.val ) + { + if ( cpu_has_cx16 ) + { + __uint128_t ret; + struct iremap_entry old_ire; + + old_ire = *iremap_entry; + ret = cmpxchg16b(iremap_entry, &old_ire, &new_ire); + + /* + * In the above, we use cmpxchg16 to atomically update the 128-bit + * IRTE, and the hardware cannot update the IRTE behind us, so + * the return value of cmpxchg16 should be the same as old_ire. + * This ASSERT validate it. + */ + ASSERT(ret == old_ire.val); + } + else + { + iremap_entry->lo = new_ire.lo; + iremap_entry->hi = new_ire.hi; + } + + iommu_flush_cache_entry(iremap_entry, sizeof(struct iremap_entry)); + iommu_flush_iec_index(iommu, 0, index); + } + + unmap_vtd_domain_page(iremap_entries); + return 0; +} + static int msi_msg_to_remap_entry( - struct iommu *iommu, struct pci_dev *pdev, + struct iommu *iommu, const struct pci_dev *pdev, struct msi_desc *msi_desc, struct msi_msg *msg) { struct iremap_entry *iremap_entry = NULL, *iremap_entries; - struct iremap_entry new_ire; struct msi_msg_remap_entry *remap_rte; unsigned int index, i, nr = 1; unsigned long flags; struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); + void *pi_desc; + int gvec; if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) nr = msi_desc->msi.nvec; @@ -592,38 +693,33 @@ static int msi_msg_to_remap_entry( return -EFAULT; } + /* Get the IRTE's bind relationship with guest from the live IRTE. */ GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, iremap_entries, iremap_entry); - - memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry)); - - /* Set interrupt remapping table entry */ - new_ire.remap.fpd = 0; - new_ire.remap.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1; - new_ire.remap.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; - new_ire.remap.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1; - /* Hardware require RH = 1 for LPR delivery mode */ - new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio); - new_ire.remap.avail = 0; - new_ire.remap.res_1 = 0; - new_ire.remap.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & - MSI_DATA_VECTOR_MASK; - new_ire.remap.res_2 = 0; - if ( x2apic_enabled ) - new_ire.remap.dst = msg->dest32; + if ( !iremap_entry->remap.im ) + { + gvec = 0; + pi_desc = NULL; + } else - new_ire.remap.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) - & 0xff) << 8; + { + gvec = iremap_entry->post.vector; + pi_desc = (void *)((((u64)iremap_entry->post.pda_h) << PDA_LOW_BIT ) + + iremap_entry->post.pda_l); + } + unmap_vtd_domain_page(iremap_entries); - if ( pdev ) - set_msi_source_id(pdev, &new_ire); - else - set_hpet_source_id(msi_desc->hpet_id, &new_ire); - new_ire.remap.res_3 = 0; - new_ire.remap.res_4 = 0; - new_ire.remap.p = 1; /* finally, set present bit */ + /* + * Actually we can just suppress the update when IRTE is already in posted + * format. After a msi gets bound to a guest interrupt, changes to the msi + * message have no effect to the IRTE. + */ + update_irte_for_msi_common(iommu, pdev, msi_desc, msg, pi_desc, gvec); /* now construct new MSI/MSI-X rte entry */ + if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) + nr = msi_desc->msi.nvec; + remap_rte = (struct msi_msg_remap_entry *)msg; remap_rte->address_lo.dontcare = 0; i = index; @@ -637,11 +733,6 @@ static int msi_msg_to_remap_entry( remap_rte->address_hi = 0; remap_rte->data = index - i; - memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry)); - iommu_flush_cache_entry(iremap_entry, sizeof(struct iremap_entry)); - iommu_flush_iec_index(iommu, 0, index); - - unmap_vtd_domain_page(iremap_entries); spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); return 0; } @@ -902,42 +993,6 @@ void iommu_disable_x2apic_IR(void) disable_qinval(drhd->iommu); } -static void setup_posted_irte( - struct iremap_entry *new_ire, const struct iremap_entry *old_ire, - const struct pi_desc *pi_desc, const uint8_t gvec) -{ - memset(new_ire, 0, sizeof(*new_ire)); - - /* - * 'im' filed decides whether the irte is in posted format (with value 1) - * or remapped format (with value 0), if the old irte is in remapped format, - * we copy things from remapped part in 'struct iremap_entry', otherwise, - * we copy from posted part. - */ - if ( !old_ire->remap.im ) - { - new_ire->post.p = old_ire->remap.p; - new_ire->post.fpd = old_ire->remap.fpd; - new_ire->post.sid = old_ire->remap.sid; - new_ire->post.sq = old_ire->remap.sq; - new_ire->post.svt = old_ire->remap.svt; - } - else - { - new_ire->post.p = old_ire->post.p; - new_ire->post.fpd = old_ire->post.fpd; - new_ire->post.sid = old_ire->post.sid; - new_ire->post.sq = old_ire->post.sq; - new_ire->post.svt = old_ire->post.svt; - new_ire->post.urg = old_ire->post.urg; - } - - new_ire->post.im = 1; - new_ire->post.vector = gvec; - new_ire->post.pda_l = virt_to_maddr(pi_desc) >> (32 - PDA_LOW_BIT); - new_ire->post.pda_h = virt_to_maddr(pi_desc) >> 32; -} - /* * This function is used to update the IRTE for posted-interrupt * when guest changes MSI/MSI-X information. @@ -947,16 +1002,13 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq, { struct irq_desc *desc; const struct msi_desc *msi_desc; - int remap_index; int rc = 0; const struct pci_dev *pci_dev; const struct acpi_drhd_unit *drhd; struct iommu *iommu; struct ir_ctrl *ir_ctrl; - struct iremap_entry *iremap_entries = NULL, *p = NULL; - struct iremap_entry new_ire, old_ire; + unsigned long flags; const struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc; - __uint128_t ret; desc = pirq_spin_lock_irq_desc(pirq, NULL); if ( !desc ) @@ -976,8 +1028,6 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq, goto unlock_out; } - remap_index = msi_desc->remap_index; - spin_unlock_irq(&desc->lock); ASSERT(pcidevs_locked()); @@ -996,31 +1046,11 @@ int pi_update_irte(const struct vcpu *v, const struct pirq *pirq, if ( !ir_ctrl ) return -ENODEV; - spin_lock_irq(&ir_ctrl->iremap_lock); - - GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, remap_index, iremap_entries, p); - - old_ire = *p; - - /* Setup/Update interrupt remapping table entry. */ - setup_posted_irte(&new_ire, &old_ire, pi_desc, gvec); - ret = cmpxchg16b(p, &old_ire, &new_ire); - - /* - * In the above, we use cmpxchg16 to atomically update the 128-bit IRTE, - * and the hardware cannot update the IRTE behind us, so the return value - * of cmpxchg16 should be the same as old_ire. This ASSERT validate it. - */ - ASSERT(ret == old_ire.val); - - iommu_flush_cache_entry(p, sizeof(*p)); - iommu_flush_iec_index(iommu, 0, remap_index); - - unmap_vtd_domain_page(iremap_entries); - - spin_unlock_irq(&ir_ctrl->iremap_lock); - - return 0; + spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); + rc = update_irte_for_msi_common(iommu, pci_dev, msi_desc, NULL, pi_desc, + gvec); + spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); + return rc; unlock_out: spin_unlock_irq(&desc->lock);