| Message ID | 20250303070501.2740392-1-danielsftsai@google.com (mailing list archive) |
|---|---|
| State | Changes Requested |
| Delegated to | Krzysztof Wilczyński |
| Series | PCI: dwc: Chain the set IRQ affinity request back to the parent |
On Mon, Mar 03 2025 at 07:05, Daniel Tsai wrote: > +/* > + * The algo here honor if there is any intersection of mask of > + * the existing MSI vectors and the requesting MSI vector. So we > + * could handle both narrow (1 bit set mask) and wide (0xffff...) > + * cases, return -EINVAL and reject the request if the result of > + * cpumask is empty, otherwise return 0 and have the calculated > + * result on the mask_to_check to pass down to the irq_chip. > + */ > +static int dw_pci_check_mask_compatibility(struct dw_pcie_rp *pp, > + unsigned long ctrl, > + unsigned long hwirq_to_check, > + struct cpumask *mask_to_check) > +{ > + unsigned long end, hwirq; > + const struct cpumask *mask; > + unsigned int virq; > + > + hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL; > + end = hwirq + MAX_MSI_IRQS_PER_CTRL; > + for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) { > + if (hwirq == hwirq_to_check) > + continue; > + virq = irq_find_mapping(pp->irq_domain, hwirq); > + if (!virq) > + continue; > + mask = irq_get_affinity_mask(virq); What protects @mask against a concurrent modification? > + if (!cpumask_and(mask_to_check, mask, mask_to_check)) > + return -EINVAL; > + } > + > + return 0; > +} > + > +static void dw_pci_update_effective_affinity(struct dw_pcie_rp *pp, > + unsigned long ctrl, > + const struct cpumask *effective_mask, > + unsigned long hwirq_to_check) > +{ > + struct irq_desc *desc_downstream; > + unsigned int virq_downstream; > + unsigned long end, hwirq; > + > + /* > + * Update all the irq_data's effective mask > + * bind to this MSI controller, so the correct > + * affinity would reflect on > + * /proc/irq/XXX/effective_affinity > + */ > + hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL; > + end = hwirq + MAX_MSI_IRQS_PER_CTRL; > + for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) { > + virq_downstream = irq_find_mapping(pp->irq_domain, hwirq); > + if (!virq_downstream) > + continue; > + desc_downstream = irq_to_desc(virq_downstream); > + irq_data_update_effective_affinity(&desc_downstream->irq_data, > + effective_mask); Same here. > + } > +} > + > +static int dw_pci_msi_set_affinity(struct irq_data *d, > + const struct cpumask *mask, bool force) > +{ > + struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d); > + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); > + int ret; > + int virq_parent; > + unsigned long hwirq = d->hwirq; > + unsigned long flags, ctrl; > + struct irq_desc *desc_parent; > + const struct cpumask *effective_mask; > + cpumask_var_t mask_result; > + > + ctrl = hwirq / MAX_MSI_IRQS_PER_CTRL; > + if (!alloc_cpumask_var(&mask_result, GFP_ATOMIC)) > + return -ENOMEM; This does not work on a RT enabled kernel. Allocations with a raw spin lock held are not possible. > + /* > + * Loop through all possible MSI vector to check if the > + * requested one is compatible with all of them > + */ > + raw_spin_lock_irqsave(&pp->lock, flags); > + cpumask_copy(mask_result, mask); > + ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq, mask_result); > + if (ret) { > + dev_dbg(pci->dev, "Incompatible mask, request %*pbl, irq num %u\n", > + cpumask_pr_args(mask), d->irq); > + goto unlock; > + } > + > + dev_dbg(pci->dev, "Final mask, request %*pbl, irq num %u\n", > + cpumask_pr_args(mask_result), d->irq); > + > + virq_parent = pp->msi_irq[ctrl]; > + desc_parent = irq_to_desc(virq_parent); > + ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data, > + mask_result, force); Again. Completely unserialized. Thanks, tglx
Hi Daniel, kernel test robot noticed the following build errors: [auto build test ERROR on pci/next] [also build test ERROR on pci/for-linus mani-mhi/mhi-next linus/master v6.14-rc5 next-20250228] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Daniel-Tsai/PCI-dwc-Chain-the-set-IRQ-affinity-request-back-to-the-parent/20250303-150704 base: https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next patch link: https://lore.kernel.org/r/20250303070501.2740392-1-danielsftsai%40google.com patch subject: [PATCH] PCI: dwc: Chain the set IRQ affinity request back to the parent config: x86_64-buildonly-randconfig-006-20250303 (https://download.01.org/0day-ci/archive/20250303/202503031726.KLKoMMZk-lkp@intel.com/config) compiler: clang version 19.1.7 (https://github.com/llvm/llvm-project cd708029e0b2869e80abe31ddb175f7c35361f90) reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250303/202503031726.KLKoMMZk-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202503031726.KLKoMMZk-lkp@intel.com/ All errors (new ones prefixed by >>): >> drivers/pci/controller/dwc/pcie-designware-host.c:223:45: error: no member named 'affinity' in 'struct irq_common_data' 223 | cpumask_copy(desc_parent->irq_common_data.affinity, mask); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^ drivers/pci/controller/dwc/pcie-designware-host.c:507:30: warning: shift count >= width of type [-Wshift-count-overflow] 507 | dma_set_coherent_mask(dev, DMA_BIT_MASK(64)); | ^~~~~~~~~~~~~~~~ include/linux/dma-mapping.h:73:54: note: expanded from macro 'DMA_BIT_MASK' 73 | #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) | ^ ~~~ 1 warning and 1 error generated. 
vim +223 drivers/pci/controller/dwc/pcie-designware-host.c 178 179 static int dw_pci_msi_set_affinity(struct irq_data *d, 180 const struct cpumask *mask, bool force) 181 { 182 struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d); 183 struct dw_pcie *pci = to_dw_pcie_from_pp(pp); 184 int ret; 185 int virq_parent; 186 unsigned long hwirq = d->hwirq; 187 unsigned long flags, ctrl; 188 struct irq_desc *desc_parent; 189 const struct cpumask *effective_mask; 190 cpumask_var_t mask_result; 191 192 ctrl = hwirq / MAX_MSI_IRQS_PER_CTRL; 193 if (!alloc_cpumask_var(&mask_result, GFP_ATOMIC)) 194 return -ENOMEM; 195 196 /* 197 * Loop through all possible MSI vector to check if the 198 * requested one is compatible with all of them 199 */ 200 raw_spin_lock_irqsave(&pp->lock, flags); 201 cpumask_copy(mask_result, mask); 202 ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq, mask_result); 203 if (ret) { 204 dev_dbg(pci->dev, "Incompatible mask, request %*pbl, irq num %u\n", 205 cpumask_pr_args(mask), d->irq); 206 goto unlock; 207 } 208 209 dev_dbg(pci->dev, "Final mask, request %*pbl, irq num %u\n", 210 cpumask_pr_args(mask_result), d->irq); 211 212 virq_parent = pp->msi_irq[ctrl]; 213 desc_parent = irq_to_desc(virq_parent); 214 ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data, 215 mask_result, force); 216 217 if (ret < 0) 218 goto unlock; 219 220 switch (ret) { 221 case IRQ_SET_MASK_OK: 222 case IRQ_SET_MASK_OK_DONE: > 223 cpumask_copy(desc_parent->irq_common_data.affinity, mask); 224 fallthrough; 225 case IRQ_SET_MASK_OK_NOCOPY: 226 break; 227 } 228 229 effective_mask = irq_data_get_effective_affinity_mask(&desc_parent->irq_data); 230 dw_pci_update_effective_affinity(pp, ctrl, effective_mask, hwirq); 231 232 unlock: 233 free_cpumask_var(mask_result); 234 raw_spin_unlock_irqrestore(&pp->lock, flags); 235 return ret < 0 ? ret : IRQ_SET_MASK_OK_NOCOPY; 236 } 237
Hi Daniel, kernel test robot noticed the following build errors: [auto build test ERROR on pci/next] [also build test ERROR on pci/for-linus mani-mhi/mhi-next linus/master v6.14-rc5 next-20250228] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Daniel-Tsai/PCI-dwc-Chain-the-set-IRQ-affinity-request-back-to-the-parent/20250303-150704 base: https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next patch link: https://lore.kernel.org/r/20250303070501.2740392-1-danielsftsai%40google.com patch subject: [PATCH] PCI: dwc: Chain the set IRQ affinity request back to the parent config: sparc64-randconfig-002-20250303 (https://download.01.org/0day-ci/archive/20250303/202503031759.oiGkE9fl-lkp@intel.com/config) compiler: sparc64-linux-gcc (GCC) 14.2.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250303/202503031759.oiGkE9fl-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202503031759.oiGkE9fl-lkp@intel.com/ All errors (new ones prefixed by >>): drivers/pci/controller/dwc/pcie-designware-host.c: In function 'dw_pci_msi_set_affinity': >> drivers/pci/controller/dwc/pcie-designware-host.c:223:58: error: 'struct irq_common_data' has no member named 'affinity' 223 | cpumask_copy(desc_parent->irq_common_data.affinity, mask); | ^ vim +223 drivers/pci/controller/dwc/pcie-designware-host.c 178 179 static int dw_pci_msi_set_affinity(struct irq_data *d, 180 const struct cpumask *mask, bool force) 181 { 182 struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d); 183 struct dw_pcie *pci = to_dw_pcie_from_pp(pp); 184 int ret; 185 int virq_parent; 186 unsigned long hwirq = d->hwirq; 187 unsigned long flags, ctrl; 188 struct irq_desc *desc_parent; 189 const struct cpumask *effective_mask; 190 cpumask_var_t mask_result; 191 192 ctrl = hwirq / MAX_MSI_IRQS_PER_CTRL; 193 if (!alloc_cpumask_var(&mask_result, GFP_ATOMIC)) 194 return -ENOMEM; 195 196 /* 197 * Loop through all possible MSI vector to check if the 198 * requested one is compatible with all of them 199 */ 200 raw_spin_lock_irqsave(&pp->lock, flags); 201 cpumask_copy(mask_result, mask); 202 ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq, mask_result); 203 if (ret) { 204 dev_dbg(pci->dev, "Incompatible mask, request %*pbl, irq num %u\n", 205 cpumask_pr_args(mask), d->irq); 206 goto unlock; 207 } 208 209 dev_dbg(pci->dev, "Final mask, request %*pbl, irq num %u\n", 210 cpumask_pr_args(mask_result), d->irq); 211 212 virq_parent = pp->msi_irq[ctrl]; 213 desc_parent = irq_to_desc(virq_parent); 214 ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data, 215 mask_result, force); 216 217 if (ret < 0) 218 goto unlock; 219 220 switch (ret) { 221 case IRQ_SET_MASK_OK: 222 case IRQ_SET_MASK_OK_DONE: > 223 cpumask_copy(desc_parent->irq_common_data.affinity, mask); 224 fallthrough; 225 case IRQ_SET_MASK_OK_NOCOPY: 226 break; 227 } 228 229 effective_mask = irq_data_get_effective_affinity_mask(&desc_parent->irq_data); 230 dw_pci_update_effective_affinity(pp, ctrl, effective_mask, hwirq); 231 232 unlock: 233 free_cpumask_var(mask_result); 234 raw_spin_unlock_irqrestore(&pp->lock, flags); 235 return ret < 0 ? 
ret : IRQ_SET_MASK_OK_NOCOPY; 236 } 237
On Mon, Mar 3, 2025 at 5:10 PM Thomas Gleixner <tglx@linutronix.de> wrote: > > On Mon, Mar 03 2025 at 07:05, Daniel Tsai wrote: > > +/* > > + * The algo here honor if there is any intersection of mask of > > + * the existing MSI vectors and the requesting MSI vector. So we > > + * could handle both narrow (1 bit set mask) and wide (0xffff...) > > + * cases, return -EINVAL and reject the request if the result of > > + * cpumask is empty, otherwise return 0 and have the calculated > > + * result on the mask_to_check to pass down to the irq_chip. > > + */ > > +static int dw_pci_check_mask_compatibility(struct dw_pcie_rp *pp, > > + unsigned long ctrl, > > + unsigned long hwirq_to_check, > > + struct cpumask *mask_to_check) > > +{ > > + unsigned long end, hwirq; > > + const struct cpumask *mask; > > + unsigned int virq; > > + > > + hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL; > > + end = hwirq + MAX_MSI_IRQS_PER_CTRL; > > + for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) { > > + if (hwirq == hwirq_to_check) > > + continue; > > + virq = irq_find_mapping(pp->irq_domain, hwirq); > > + if (!virq) > > + continue; > > + mask = irq_get_affinity_mask(virq); > > What protects @mask against a concurrent modification? We hold the &pp->lock in the dw_pci_msi_set_affinity before calling this function so that should help to protect against concurrent modification ? > > > + if (!cpumask_and(mask_to_check, mask, mask_to_check)) > > + return -EINVAL; > > + } > > + > > + return 0; > > +} > > + > > +static void dw_pci_update_effective_affinity(struct dw_pcie_rp *pp, > > + unsigned long ctrl, > > + const struct cpumask *effective_mask, > > + unsigned long hwirq_to_check) > > +{ > > + struct irq_desc *desc_downstream; > > + unsigned int virq_downstream; > > + unsigned long end, hwirq; > > + > > + /* > > + * Update all the irq_data's effective mask > > + * bind to this MSI controller, so the correct > > + * affinity would reflect on > > + * /proc/irq/XXX/effective_affinity > > + */ > > + hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL; > > + end = hwirq + MAX_MSI_IRQS_PER_CTRL; > > + for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) { > > + virq_downstream = irq_find_mapping(pp->irq_domain, hwirq); > > + if (!virq_downstream) > > + continue; > > + desc_downstream = irq_to_desc(virq_downstream); > > + irq_data_update_effective_affinity(&desc_downstream->irq_data, > > + effective_mask); > > Same here. We hold the &pp->lock in the dw_pci_msi_set_affinity before calling here, so that could help I think ? > > > + } > > +} > > + > > +static int dw_pci_msi_set_affinity(struct irq_data *d, > > + const struct cpumask *mask, bool force) > > +{ > > + struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d); > > + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); > > + int ret; > > + int virq_parent; > > + unsigned long hwirq = d->hwirq; > > + unsigned long flags, ctrl; > > + struct irq_desc *desc_parent; > > + const struct cpumask *effective_mask; > > + cpumask_var_t mask_result; > > + > > + ctrl = hwirq / MAX_MSI_IRQS_PER_CTRL; > > + if (!alloc_cpumask_var(&mask_result, GFP_ATOMIC)) > > + return -ENOMEM; > > This does not work on a RT enabled kernel. Allocations with a raw spin > lock held are not possible. > Even with the GFP_ATOMIC flag ? I thought that means it would be safe to do allocation in the atomic context ? Didn't work on the RT kernel before, so please enlighten me if I am wrong and if you don't mind. 
> > + /* > > + * Loop through all possible MSI vector to check if the > > + * requested one is compatible with all of them > > + */ > > + raw_spin_lock_irqsave(&pp->lock, flags); > > + cpumask_copy(mask_result, mask); > > + ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq, mask_result); > > + if (ret) { > > + dev_dbg(pci->dev, "Incompatible mask, request %*pbl, irq num %u\n", > > + cpumask_pr_args(mask), d->irq); > > + goto unlock; > > + } > > + > > + dev_dbg(pci->dev, "Final mask, request %*pbl, irq num %u\n", > > + cpumask_pr_args(mask_result), d->irq); > > + > > + virq_parent = pp->msi_irq[ctrl]; > > + desc_parent = irq_to_desc(virq_parent); > > + ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data, > > + mask_result, force); > > Again. Completely unserialized. > The reason why we remove the desc_parent->lock protection is because the chained IRQ structure didn't export parent IRQ to the user space, so we think this call path should be the only caller. And since we have &pp->lock hold at the beginning, so that should help to protect against concurrent modification here ? > Thanks, > > tglx Sorry for the test build error, I will fix it and re-send a new patch. Thanks
On Tue, Mar 04 2025 at 13:48, Tsai Sung-Fu wrote: > On Mon, Mar 3, 2025 at 5:10 PM Thomas Gleixner <tglx@linutronix.de> wrote: >> > + hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL; >> > + end = hwirq + MAX_MSI_IRQS_PER_CTRL; >> > + for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) { >> > + if (hwirq == hwirq_to_check) >> > + continue; >> > + virq = irq_find_mapping(pp->irq_domain, hwirq); >> > + if (!virq) >> > + continue; >> > + mask = irq_get_affinity_mask(virq); >> >> What protects @mask against a concurrent modification? > > We hold the &pp->lock in the dw_pci_msi_set_affinity before calling > this function > so that should help to protect against concurrent modification ? So that's the same domain, right? >> > + /* >> > + * Update all the irq_data's effective mask >> > + * bind to this MSI controller, so the correct >> > + * affinity would reflect on >> > + * /proc/irq/XXX/effective_affinity >> > + */ >> > + hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL; >> > + end = hwirq + MAX_MSI_IRQS_PER_CTRL; >> > + for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) { >> > + virq_downstream = irq_find_mapping(pp->irq_domain, hwirq); >> > + if (!virq_downstream) >> > + continue; >> > + desc_downstream = irq_to_desc(virq_downstream); >> > + irq_data_update_effective_affinity(&desc_downstream->irq_data, >> > + effective_mask); >> >> Same here. > > We hold the &pp->lock in the dw_pci_msi_set_affinity before calling > here, so that could help I think ? A comment would be helpful for the casual reader. >> >> > + } >> > +} >> > + >> > +static int dw_pci_msi_set_affinity(struct irq_data *d, >> > + const struct cpumask *mask, bool force) >> > +{ >> > + struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d); >> > + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); >> > + int ret; >> > + int virq_parent; >> > + unsigned long hwirq = d->hwirq; >> > + unsigned long flags, ctrl; >> > + struct irq_desc *desc_parent; >> > + const struct cpumask *effective_mask; >> > + cpumask_var_t mask_result; >> > + >> > + ctrl = hwirq / MAX_MSI_IRQS_PER_CTRL; >> > + if (!alloc_cpumask_var(&mask_result, GFP_ATOMIC)) >> > + return -ENOMEM; >> >> This does not work on a RT enabled kernel. Allocations with a raw spin >> lock held are not possible. >> > > Even with the GFP_ATOMIC flag ? I thought that means it would be safe > to do allocation in the atomic context ? > Didn't work on the RT kernel before, so please enlighten me if I am > wrong and if you don't mind. https://www.kernel.org/doc/html/latest/locking/locktypes.html#preempt-rt-caveats https://www.kernel.org/doc/html/latest/locking/locktypes.html#raw-spinlock-t-on-rt >> > + /* >> > + * Loop through all possible MSI vector to check if the >> > + * requested one is compatible with all of them >> > + */ >> > + raw_spin_lock_irqsave(&pp->lock, flags); >> > + cpumask_copy(mask_result, mask); >> > + ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq, mask_result); >> > + if (ret) { >> > + dev_dbg(pci->dev, "Incompatible mask, request %*pbl, irq num %u\n", >> > + cpumask_pr_args(mask), d->irq); >> > + goto unlock; >> > + } >> > + >> > + dev_dbg(pci->dev, "Final mask, request %*pbl, irq num %u\n", >> > + cpumask_pr_args(mask_result), d->irq); >> > + >> > + virq_parent = pp->msi_irq[ctrl]; >> > + desc_parent = irq_to_desc(virq_parent); >> > + ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data, >> > + mask_result, force); >> >> Again. Completely unserialized. 
>> > > The reason why we remove the desc_parent->lock protection is because > the chained IRQ structure didn't export parent IRQ to the user space, so we > think this call path should be the only caller. And since we have &pp->lock hold > at the beginning, so that should help to protect against concurrent > modification here ? "Should be the only caller" is not really a technical argument. If you make assumptions like this, then you have to come up with a proper explanation why this is correct under all circumstances and with all possible variants of parent interrupt chips. Aside of that fiddling with the internals of interrupt descriptors is not really justified here. What's wrong with using the existing irq_set_affinity() mechanism? Thanks, tglx
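For reference, the "existing irq_set_affinity() mechanism" tglx points at would, at its simplest, look like the sketch below: the chip callback hands the parent interrupt number to irq_set_affinity() and lets the core serialize on the parent descriptor's own lock instead of dereferencing irq_to_desc() directly. This is a sketch only, not the posted patch, and the reply that follows explains why calling it while pp->lock is held runs into a lock-ordering problem.

```c
/*
 * Sketch only, not part of the posted patch: route the parent affinity
 * change through the core API. The compatibility check of @mask against
 * the sibling vectors from the posted patch is elided here.
 */
static int dw_pci_msi_set_affinity(struct irq_data *d,
				   const struct cpumask *mask, bool force)
{
	struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d);
	unsigned long ctrl = d->hwirq / MAX_MSI_IRQS_PER_CTRL;

	/* 0 from irq_set_affinity() maps to IRQ_SET_MASK_OK. */
	return irq_set_affinity(pp->msi_irq[ctrl], mask);
}
```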
On Tue, Mar 4, 2025 at 5:46 PM Thomas Gleixner <tglx@linutronix.de> wrote: > > On Tue, Mar 04 2025 at 13:48, Tsai Sung-Fu wrote: > > On Mon, Mar 3, 2025 at 5:10 PM Thomas Gleixner <tglx@linutronix.de> wrote: > >> > + hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL; > >> > + end = hwirq + MAX_MSI_IRQS_PER_CTRL; > >> > + for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) { > >> > + if (hwirq == hwirq_to_check) > >> > + continue; > >> > + virq = irq_find_mapping(pp->irq_domain, hwirq); > >> > + if (!virq) > >> > + continue; > >> > + mask = irq_get_affinity_mask(virq); > >> > >> What protects @mask against a concurrent modification? > > > > We hold the &pp->lock in the dw_pci_msi_set_affinity before calling > > this function > > so that should help to protect against concurrent modification ? > > So that's the same domain, right? > Yes, all from pp->irq_domain allocated by the driver by calling to irq_domain_create_linear(). I will add the lockdep_assert_held() check. > >> > + /* > >> > + * Update all the irq_data's effective mask > >> > + * bind to this MSI controller, so the correct > >> > + * affinity would reflect on > >> > + * /proc/irq/XXX/effective_affinity > >> > + */ > >> > + hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL; > >> > + end = hwirq + MAX_MSI_IRQS_PER_CTRL; > >> > + for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) { > >> > + virq_downstream = irq_find_mapping(pp->irq_domain, hwirq); > >> > + if (!virq_downstream) > >> > + continue; > >> > + desc_downstream = irq_to_desc(virq_downstream); > >> > + irq_data_update_effective_affinity(&desc_downstream->irq_data, > >> > + effective_mask); > >> > >> Same here. > > > > We hold the &pp->lock in the dw_pci_msi_set_affinity before calling > > here, so that could help I think ? > > A comment would be helpful for the casual reader. Sure, will also add the lockdep_assert_held() check > > >> > >> > + } > >> > +} > >> > + > >> > +static int dw_pci_msi_set_affinity(struct irq_data *d, > >> > + const struct cpumask *mask, bool force) > >> > +{ > >> > + struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d); > >> > + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); > >> > + int ret; > >> > + int virq_parent; > >> > + unsigned long hwirq = d->hwirq; > >> > + unsigned long flags, ctrl; > >> > + struct irq_desc *desc_parent; > >> > + const struct cpumask *effective_mask; > >> > + cpumask_var_t mask_result; > >> > + > >> > + ctrl = hwirq / MAX_MSI_IRQS_PER_CTRL; > >> > + if (!alloc_cpumask_var(&mask_result, GFP_ATOMIC)) > >> > + return -ENOMEM; > >> > >> This does not work on a RT enabled kernel. Allocations with a raw spin > >> lock held are not possible. > >> > > > > Even with the GFP_ATOMIC flag ? I thought that means it would be safe > > to do allocation in the atomic context ? > > Didn't work on the RT kernel before, so please enlighten me if I am > > wrong and if you don't mind. > > https://www.kernel.org/doc/html/latest/locking/locktypes.html#preempt-rt-caveats > https://www.kernel.org/doc/html/latest/locking/locktypes.html#raw-spinlock-t-on-rt > I will remove the Allocation and move the cpumask to the device structure. Thanks for the doc. 
> >> > + /* > >> > + * Loop through all possible MSI vector to check if the > >> > + * requested one is compatible with all of them > >> > + */ > >> > + raw_spin_lock_irqsave(&pp->lock, flags); > >> > + cpumask_copy(mask_result, mask); > >> > + ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq, mask_result); > >> > + if (ret) { > >> > + dev_dbg(pci->dev, "Incompatible mask, request %*pbl, irq num %u\n", > >> > + cpumask_pr_args(mask), d->irq); > >> > + goto unlock; > >> > + } > >> > + > >> > + dev_dbg(pci->dev, "Final mask, request %*pbl, irq num %u\n", > >> > + cpumask_pr_args(mask_result), d->irq); > >> > + > >> > + virq_parent = pp->msi_irq[ctrl]; > >> > + desc_parent = irq_to_desc(virq_parent); > >> > + ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data, > >> > + mask_result, force); > >> > >> Again. Completely unserialized. > >> > > > > The reason why we remove the desc_parent->lock protection is because > > the chained IRQ structure didn't export parent IRQ to the user space, so we > > think this call path should be the only caller. And since we have &pp->lock hold > > at the beginning, so that should help to protect against concurrent > > modification here ? > > "Should be the only caller" is not really a technical argument. If you > make assumptions like this, then you have to come up with a proper > explanation why this is correct under all circumstances and with all > possible variants of parent interrupt chips. > > Aside of that fiddling with the internals of interrupt descriptors is > not really justified here. What's wrong with using the existing > irq_set_affinity() mechanism? > > Thanks, > > tglx > Using irq_set_affinity() would give us some circular deadlock warning at the probe stage. irq_startup() flow would hold the global lock from irq_setup_affinity(), so deadlock scenario like this below might happen. The irq_set_affinity() would try to acquire &irq_desc_lock_class while the dw_pci_msi_set_affinity() already hold the &pp->lock. To resolve that we would like to reuse the idea to replace the global lock in irq_set_affinity() with PERCPU structure from this patch -> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=64b6d1d7a84538de34c22a6fc92a7dcc2b196b64 Just would like to know if this looks feasible to you ? Chain exists of: &irq_desc_lock_class --> mask_lock --> &pp->lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&pp->lock); lock(mask_lock); lock(&pp->lock); lock(&irq_desc_lock_class); *** DEADLOCK *** Thanks
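A minimal sketch of the two follow-ups promised in the reply above, assuming a scratch cpumask field (the name msi_scratch_mask is made up for illustration) is added to struct dw_pcie_rp next to the existing raw spinlock, so nothing is allocated while the raw spinlock is held on a PREEMPT_RT kernel, and the helpers assert the lock they rely on:

```c
/*
 * Sketch, not the posted patch. 'msi_scratch_mask' is a hypothetical
 * member of struct dw_pcie_rp, protected by pp->lock.
 */
static int dw_pci_check_mask_compatibility(struct dw_pcie_rp *pp,
					   unsigned long ctrl,
					   unsigned long hwirq_to_check,
					   struct cpumask *mask_to_check)
{
	/* Callers serialize on pp->lock; make that assumption checkable. */
	lockdep_assert_held(&pp->lock);

	/* ... intersection walk exactly as in the posted patch ... */
	return 0;
}

static int dw_pci_msi_set_affinity(struct irq_data *d,
				   const struct cpumask *mask, bool force)
{
	struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d);
	unsigned long flags;
	int ret;

	raw_spin_lock_irqsave(&pp->lock, flags);
	/* Preallocated scratch mask instead of alloc_cpumask_var(GFP_ATOMIC). */
	cpumask_copy(&pp->msi_scratch_mask, mask);
	ret = dw_pci_check_mask_compatibility(pp, d->hwirq / MAX_MSI_IRQS_PER_CTRL,
					      d->hwirq, &pp->msi_scratch_mask);
	/* ... parent affinity update as in the posted patch ... */
	raw_spin_unlock_irqrestore(&pp->lock, flags);

	return ret < 0 ? ret : IRQ_SET_MASK_OK_NOCOPY;
}
```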
On Wed, Mar 05 2025 at 19:21, Tsai Sung-Fu wrote: > On Tue, Mar 4, 2025 at 5:46 PM Thomas Gleixner <tglx@linutronix.de> wrote: >> >> > + ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data, >> >> > + mask_result, force); >> >> >> >> Again. Completely unserialized. >> >> >> > >> > The reason why we remove the desc_parent->lock protection is because >> > the chained IRQ structure didn't export parent IRQ to the user space, so we >> > think this call path should be the only caller. And since we have &pp->lock hold >> > at the beginning, so that should help to protect against concurrent >> > modification here ? >> >> "Should be the only caller" is not really a technical argument. If you >> make assumptions like this, then you have to come up with a proper >> explanation why this is correct under all circumstances and with all >> possible variants of parent interrupt chips. >> >> Aside of that fiddling with the internals of interrupt descriptors is >> not really justified here. What's wrong with using the existing >> irq_set_affinity() mechanism? >> > Using irq_set_affinity() would give us some circular deadlock warning > at the probe stage. > irq_startup() flow would hold the global lock from irq_setup_affinity(), so > deadlock scenario like this below might happen. The irq_set_affinity() > would try to acquire &irq_desc_lock_class > while the dw_pci_msi_set_affinity() already hold the &pp->lock. To > resolve that we would like to reuse the idea to > replace the global lock in irq_set_affinity() with PERCPU structure > from this patch -> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=64b6d1d7a84538de34c22a6fc92a7dcc2b196b64 > Just would like to know if this looks feasible to you ? It won't help. See below. > Chain exists of: > &irq_desc_lock_class --> mask_lock --> &pp->lock > Possible unsafe locking scenario: > CPU0 CPU1 > ---- ---- > lock(&pp->lock); > lock(mask_lock); > lock(&pp->lock); > lock(&irq_desc_lock_class); Indeed. Back to irq_set_affinity(). I did not think about the more problematic issue with calling irq_set_affinity(): The per CPU mask is then used recursive, which is a bad idea. But looking deeper, your implementation breaks some other things, like e.g. the affinity coupling of interrupt threads on the MSI side. They have to be coupled to the effective affinity of the underlying hard interrupt. Solvable problem, but not in the driver. Aside of the fact that this hardware design is a trainwreck, which defeats the intended flexibility of MSI, solving this at the chip driver level ends up in a incomplete and hacky solution. Also as this is not the only IP block, which was cobbled together mindlessly that way, we really don't want to proliferate this horrorshow all over the place. This really wants a generic infrastructure which handles all aspects of this correctly once and for all of these hardware trainwrecks. That needs some thoughts and effort, but that's definitely preferred over a half baked driver specific hackery, which fiddles in the guts of the interrupt descriptors as it sees fit. From a layering POV, a interrupt chip driver should not even care about the fact that an interrupt descriptor exists. We've spent a lot of effort to decouple these things and just because C allows it, does not mean it's a correct or a good idea to ignore the layering. 
The approach you have taken has a massive downside: It forces all (32) MSI interrupts, which are demultiplexed from the underlying parent interrupt, to have the same affinity and the user space interface in /proc/irq/*/affinity becomes more than confusing. This is not what user space expects from an interface which controls the affinity of an individual interrupt. No user and no tool expects that it has to deal with conflicting settings for MSI interrupts, which are independent of each other. Tools like irqbalanced will be pretty unhappy about the behaviour and you have to teach them about the severe limitations of this. I'm absolutely not convinced that this solves more problems than it creates. In fact, you can achieve the very same outcome with a clean and straight forward ten line change: Instead of the hidden chained handlers, request regular interrupts for demultiplexing and give them proper names 'DWC_MSI_DEMUX-0-31', 'DWC_MSI_DEMUX-32-63'... That provides exactly the same mechanism w/o all the hackery and creates the same (solvable) problem vs. interrupt thread affinity, but provides an understandable user interface. No? If you really want to provide a useful and flexible mechanism to steer the affinity of the individual MSI interrupts, which are handled by each control block, then you have to think about a completely different approach. The underlying interrupt per control block will always be affine to a particular CPU. But it's not written in stone, that the demultiplexing interrupt actually has to handle the pending interrupts in the context of the demultiplexer, at least not for MSI interrupts, which are edge type and from the PCI device side considered 'fire and forget'. So right now the demultiplex handler does: bits = read_dbi(...); for_each_bit(bit, bits) generic_handle_domain_irq(domain, base + bit); generic_handle_irq_domain() gets the interrupt mapping for the hardware interrupt number and invokes the relevant flow handler directly from the same context, which means that the affinity of all those interrupts is bound to the affinity of the underlying demultiplex interrupt. Iignoring the devil in the details, this can also be handled by delegating the handling to a different CPU: bits = read_dbi(...); for_each_bit(bit, bits) generic_handle_irq_demux_domain(domain, base + bit); where generic_handle_demux_domain_irq() does: desc = irq_resolve_mapping(domain, hwirq); if (desc->target_cpu == smp_processor_id()) { handle_irq_desc(desc); } else { // Issue DWC ack() here? - See below irq_work_queue_on(&desc->irq_work, desc->target_cpu); } The generic irq_work handler does: desc = container_of(work, struct irq_desc, irq_work); handle_irq_desc(desc); That's obviously asynchronous, but in case of edge type MSI interrupts, that's not a problem at all. The real problems are: 1) Handling the pending bit of the interrupt in the DBI, i.e. what dw_pci_bottom_ack() does. 2) Life time of the interrupt descriptor 3) Slightly increased latency of the rerouted interrupts 4) Affinity of the demultiplex interrupt #1 Needs some thoughts. From the top of my head I think it would require that dw_pci_bottom_ack() is issued in the context of the demultiplex handler as it would otherwise be retriggered, but that just needs some research in the datasheet and experimental validation. The actual interrupt flow handler for the MSI interrupt is then not longer handle_edge_irq() as it should not issue the ack() again. From a correctness POV vs. 
interrupt handling of a edge type interrupt, that's perfectly fine. #2 Trivial enough to solve by flushing and invalidating the irq_work before shutdown/removal. #3 That might be an issue for some extreme scenarios, but in the general case I claim, that it does not matter at all. Those scenarios where it really matters can affine the interrupt to the affinity of the demultiplexing interrupt, which leads to #4 #4 That's a trivial problem to solve. There is absolutely no technical reason to make them hidden. Demultiplexing just works fine from the context of regular interrupts. There are some other minor issues like CPU hotplug, but I can't see anything truly complex in this design right now. Famous last words :) At the driver level this becomes really simple. Affinity changes are just selecting a target CPU and store it for trivial comparison to avoid a CPU mask check in the demultiplex path. No fiddling with sibling or parent interrupts, no locking issues. Thread affinity and other things just work as expected. I'm really tempted to play with that. Such infrastructure would allow to support PCI/multi-MSI on x86 without interrupt remapping, which is on the wishlist of a lot of people for many years. Thanks, tglx
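A rough sketch of the redirection flow outlined above. The 'redirect' members of irq_desc and the helper name are hypothetical here (they only appear in the PoC patches referenced later in the thread), and the ack-ordering and descriptor-lifetime questions from points #1 and #2 are deliberately ignored:

```c
/*
 * Sketch of the demultiplex-and-redirect idea. desc->redirect.target_cpu
 * and desc->redirect.work are assumed, not existing mainline fields.
 */
static int generic_handle_demux_domain_irq(struct irq_domain *domain,
					   unsigned int hwirq)
{
	struct irq_desc *desc = irq_resolve_mapping(domain, hwirq);

	if (!desc)
		return -EINVAL;

	/* Fast path: the demultiplexer already runs on the target CPU. */
	if (READ_ONCE(desc->redirect.target_cpu) == smp_processor_id())
		return handle_irq_desc(desc);

	/*
	 * Point #1: the DWC ack would be issued here, in the demultiplexer
	 * context, before punting the flow handling to the target CPU.
	 */
	irq_work_queue_on(&desc->redirect.work,
			  READ_ONCE(desc->redirect.target_cpu));
	return 0;
}

/* irq_work callback, runs on the selected target CPU. */
static void irq_redirect_work(struct irq_work *work)
{
	struct irq_desc *desc = container_of(work, struct irq_desc,
					     redirect.work);

	handle_irq_desc(desc);
}
```

The DWC demultiplex loop would then call this helper for each set bit instead of generic_handle_domain_irq(), so the affinity of each MSI reduces to picking a target CPU per descriptor.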
Thanks for your detailed explanation and feedback. I am a bit confused about the #4 you mentioned here -> > 4) Affinity of the demultiplex interrupt Are you saying there is a chance to queue this demultiplexing IRQ event to the currently running CPU, i.e. that it would fall into the first if clause ? > if (desc->target_cpu == smp_processor_id()) { > handle_irq_desc(desc); And that's really an approach worth trying; I will work on it. Thanks
On Fri, Mar 07 2025 at 19:10, Tsai Sung-Fu wrote: > Thanks for your detailed explanation and feedback, I am a bit confused about the > #4 you mentioned here -> > >> 4) Affinity of the demultiplex interrupt > > Are you saying there is a chance to queue this demultiplexing IRQ event > to the current running CPU ? The demultiplexing interrupt (currently a chained handler, which is hidden from /proc/irq/) stays at the affinity which the kernel decided to assign to it at startup. That means it can't be steered to a particual CPU and nobody knows to which CPU it is affine. You can only guess it from /proc/interrupts by observing where the associated demultiplex interrupts are affine to. So ideally you want to be able to affine the demultiplexing interrupt too. That requires to switch it to a regular interrupt for simplicity. We could expose those hidden chained handlers affinity too, but that needs some surgery vs. locking etc. > And that's really an approach worth to try, I will work on it. I've played with this on top of variant of Marc's changes to use MSI parent interrupts for such controllers too: https://lore.kernel.org/all/20241204124549.607054-1-maz@kernel.org/ A completely untested and largely uncompiled preview is here: https://tglx.de/~tglx/patches.tar The MSI parent parts are in flux. Marc will look at them in the next weeks, but I picked them up because it simplifies the whole business a lot. If you find bugs in that series, you can keep them :) Thanks, tglx
Running some basic tests with this patch ( https://tglx.de/~tglx/patches.tar ) applied on my device, at first glance, the affinity feature is working. I didn't run stress test to test the stability, and the Kernel version we used is a bit old, so I only applied change in this 2 patches -> Subject: genirq: Add interrupt redirection infrastructure -> Subject: PCI: dwc: Enable MSI affinity support And adding if check on irq_chip_redirect_set_affinity() and irq_set_redirect_target() to avoid cpumask_first() return nr_cpu_ids int irq_chip_redirect_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force) { .... .... if (target >= nr_cpu_ids) target = smp_processor_id(); irq_data_update_effective_affinity(data, cpumask_of(target)); } static void irq_set_redirect_target(struct irq_desc *desc) { const struct cpumask *m = irq_data_get_effective_affinity_mask(&desc->irq_data); unsigned int cpu = cpumask_first(m); if (cpu >= nr_cpu_ids) cpu = smp_processor_id(); WRITE_ONCE(desc->redirect.target_cpu, cpu); } May I ask, would this patch be officially added to the 6.14 kernel ? Thanks On Sat, Mar 8, 2025 at 3:49 AM Thomas Gleixner <tglx@linutronix.de> wrote: > > On Fri, Mar 07 2025 at 19:10, Tsai Sung-Fu wrote: > > Thanks for your detailed explanation and feedback, I am a bit confused about the > > #4 you mentioned here -> > > > >> 4) Affinity of the demultiplex interrupt > > > > Are you saying there is a chance to queue this demultiplexing IRQ event > > to the current running CPU ? > > The demultiplexing interrupt (currently a chained handler, which is > hidden from /proc/irq/) stays at the affinity which the kernel decided > to assign to it at startup. That means it can't be steered to a > particual CPU and nobody knows to which CPU it is affine. You can only > guess it from /proc/interrupts by observing where the associated > demultiplex interrupts are affine to. > > So ideally you want to be able to affine the demultiplexing interrupt > too. That requires to switch it to a regular interrupt for > simplicity. We could expose those hidden chained handlers affinity too, > but that needs some surgery vs. locking etc. > > > And that's really an approach worth to try, I will work on it. > > I've played with this on top of variant of Marc's changes to use MSI > parent interrupts for such controllers too: > > https://lore.kernel.org/all/20241204124549.607054-1-maz@kernel.org/ > > A completely untested and largely uncompiled preview is here: > > https://tglx.de/~tglx/patches.tar > > The MSI parent parts are in flux. Marc will look at them in the next > weeks, but I picked them up because it simplifies the whole business a > lot. If you find bugs in that series, you can keep them :) > > Thanks, > > tglx
On Tue, Mar 11 2025 at 17:52, Tsai Sung-Fu wrote: Please do not top-post and trim your replies. > Running some basic tests with this patch ( > https://tglx.de/~tglx/patches.tar ) applied on my device, at first > glance, the affinity feature is working. > > I didn't run stress test to test the stability, and the Kernel version > we used is a bit old, so I only applied change in this 2 patches I don't care about old kernels and what you can apply or not. Kernel development happens against upstream and not against randomly chosen private kernel versions. > And adding if check on irq_chip_redirect_set_affinity() and > irq_set_redirect_target() to avoid cpumask_first() return nr_cpu_ids I assume you know how diff works. > May I ask, would this patch be officially added to the 6.14 kernel ? You may ask. But you should know the answer already, no? The merge window for 6.14 closed on February 2nd with the release of 6.14-rc1. Anything which goes into Linus tree between rc1 and the final release is fixes only. This is new infrastructure, which has neither been posted nor reviewed nor properly tested. There are also no numbers about the overhead and no analysis whether that overhead causes regressions on existing setups. These changes want to be: 1) Put into a series with proper change logs 2) Posted on the relevant mailing list 3) Tested and proper numbers provided So they are not even close to be ready for the 6.15 merge window, simply because the irq tree is going to freeze at 6.14-rc7, i.e. by the end of this week. I'm not planning to work on them. Feel free to take the PoC patches, polish them up and post them according to the documented process. Thanks, tglx
diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index d2291c3ceb8be..4cab9f05c8813 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -49,8 +49,7 @@ static struct irq_chip dw_pcie_msi_irq_chip = { static struct msi_domain_info dw_pcie_msi_domain_info = { .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_NO_AFFINITY | MSI_FLAG_PCI_MSIX | - MSI_FLAG_MULTI_PCI_MSI, + MSI_FLAG_PCI_MSIX | MSI_FLAG_MULTI_PCI_MSI, .chip = &dw_pcie_msi_irq_chip, }; @@ -117,6 +116,125 @@ static void dw_pci_setup_msi_msg(struct irq_data *d, struct msi_msg *msg) (int)d->hwirq, msg->address_hi, msg->address_lo); } +/* + * The algo here honor if there is any intersection of mask of + * the existing MSI vectors and the requesting MSI vector. So we + * could handle both narrow (1 bit set mask) and wide (0xffff...) + * cases, return -EINVAL and reject the request if the result of + * cpumask is empty, otherwise return 0 and have the calculated + * result on the mask_to_check to pass down to the irq_chip. + */ +static int dw_pci_check_mask_compatibility(struct dw_pcie_rp *pp, + unsigned long ctrl, + unsigned long hwirq_to_check, + struct cpumask *mask_to_check) +{ + unsigned long end, hwirq; + const struct cpumask *mask; + unsigned int virq; + + hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL; + end = hwirq + MAX_MSI_IRQS_PER_CTRL; + for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) { + if (hwirq == hwirq_to_check) + continue; + virq = irq_find_mapping(pp->irq_domain, hwirq); + if (!virq) + continue; + mask = irq_get_affinity_mask(virq); + if (!cpumask_and(mask_to_check, mask, mask_to_check)) + return -EINVAL; + } + + return 0; +} + +static void dw_pci_update_effective_affinity(struct dw_pcie_rp *pp, + unsigned long ctrl, + const struct cpumask *effective_mask, + unsigned long hwirq_to_check) +{ + struct irq_desc *desc_downstream; + unsigned int virq_downstream; + unsigned long end, hwirq; + + /* + * Update all the irq_data's effective mask + * bind to this MSI controller, so the correct + * affinity would reflect on + * /proc/irq/XXX/effective_affinity + */ + hwirq = ctrl * MAX_MSI_IRQS_PER_CTRL; + end = hwirq + MAX_MSI_IRQS_PER_CTRL; + for_each_set_bit_from(hwirq, pp->msi_irq_in_use, end) { + virq_downstream = irq_find_mapping(pp->irq_domain, hwirq); + if (!virq_downstream) + continue; + desc_downstream = irq_to_desc(virq_downstream); + irq_data_update_effective_affinity(&desc_downstream->irq_data, + effective_mask); + } +} + +static int dw_pci_msi_set_affinity(struct irq_data *d, + const struct cpumask *mask, bool force) +{ + struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d); + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + int ret; + int virq_parent; + unsigned long hwirq = d->hwirq; + unsigned long flags, ctrl; + struct irq_desc *desc_parent; + const struct cpumask *effective_mask; + cpumask_var_t mask_result; + + ctrl = hwirq / MAX_MSI_IRQS_PER_CTRL; + if (!alloc_cpumask_var(&mask_result, GFP_ATOMIC)) + return -ENOMEM; + + /* + * Loop through all possible MSI vector to check if the + * requested one is compatible with all of them + */ + raw_spin_lock_irqsave(&pp->lock, flags); + cpumask_copy(mask_result, mask); + ret = dw_pci_check_mask_compatibility(pp, ctrl, hwirq, mask_result); + if (ret) { + dev_dbg(pci->dev, "Incompatible mask, request %*pbl, irq num %u\n", + cpumask_pr_args(mask), d->irq); + goto unlock; + } + + dev_dbg(pci->dev, "Final mask, request 
%*pbl, irq num %u\n", + cpumask_pr_args(mask_result), d->irq); + + virq_parent = pp->msi_irq[ctrl]; + desc_parent = irq_to_desc(virq_parent); + ret = desc_parent->irq_data.chip->irq_set_affinity(&desc_parent->irq_data, + mask_result, force); + + if (ret < 0) + goto unlock; + + switch (ret) { + case IRQ_SET_MASK_OK: + case IRQ_SET_MASK_OK_DONE: + cpumask_copy(desc_parent->irq_common_data.affinity, mask); + fallthrough; + case IRQ_SET_MASK_OK_NOCOPY: + break; + } + + effective_mask = irq_data_get_effective_affinity_mask(&desc_parent->irq_data); + dw_pci_update_effective_affinity(pp, ctrl, effective_mask, hwirq); + +unlock: + free_cpumask_var(mask_result); + raw_spin_unlock_irqrestore(&pp->lock, flags); + return ret < 0 ? ret : IRQ_SET_MASK_OK_NOCOPY; +} + static void dw_pci_bottom_mask(struct irq_data *d) { struct dw_pcie_rp *pp = irq_data_get_irq_chip_data(d); @@ -172,10 +290,14 @@ static struct irq_chip dw_pci_msi_bottom_irq_chip = { .name = "DWPCI-MSI", .irq_ack = dw_pci_bottom_ack, .irq_compose_msi_msg = dw_pci_setup_msi_msg, + .irq_set_affinity = dw_pci_msi_set_affinity, .irq_mask = dw_pci_bottom_mask, .irq_unmask = dw_pci_bottom_unmask, }; +static struct lock_class_key dw_pci_irq_lock_class; +static struct lock_class_key dw_pci_irq_request_class; + static int dw_pcie_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *args) @@ -195,11 +317,14 @@ static int dw_pcie_irq_domain_alloc(struct irq_domain *domain, if (bit < 0) return -ENOSPC; - for (i = 0; i < nr_irqs; i++) + for (i = 0; i < nr_irqs; i++) { + irq_set_lockdep_class(virq + i, &dw_pci_irq_lock_class, + &dw_pci_irq_request_class); irq_domain_set_info(domain, virq + i, bit + i, pp->msi_irq_chip, pp, handle_edge_irq, NULL, NULL); + } return 0; }