Message ID | 5f6ade76b47f033836d7a36c03555da896dfb4a3.1697824215.git.robin.murphy@arm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | perf/arm-cmn: Multi-DTC improvements | expand |
Hi Robin, I have one comment, otherwise the patch looks good to me. On Fri, 20 Oct 2023, Robin Murphy wrote: > The bitmap-based scheme for tracking DTC counter usage turns out to be a > complete dead-end for its imagined purpose, since by the time we have to > keep track of a per-DTC counter index anyway, we already have enough > information to make the bitmap itself redundant. Revert the remains of > it back to almost the original scheme, but now expanded to track per-DTC > indices, in preparation for making use of them in anger. > > Note that since cycle count events always use a dedicated counter on a > single DTC, we reuse the field to encode their DTC index directly. > > Signed-off-by: Robin Murphy <robin.murphy@arm.com> > --- > drivers/perf/arm-cmn.c | 126 +++++++++++++++++++++-------------------- > 1 file changed, 64 insertions(+), 62 deletions(-) > > diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c > index f1ac8d0cdb3b..675f1638013e 100644 > --- a/drivers/perf/arm-cmn.c > +++ b/drivers/perf/arm-cmn.c > @@ -281,16 +281,13 @@ struct arm_cmn_node { > u16 id, logid; > enum cmn_node_type type; > .... > @@ -589,8 +586,7 @@ static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {} > struct arm_cmn_hw_event { > struct arm_cmn_node *dn; > u64 dtm_idx[4]; > - unsigned int dtc_idx; > - u8 dtcs_used; > + s8 dtc_idx[CMN_MAX_DTCS]; > u8 num_dns; > u8 dtm_offset; > bool wide_sel; > @@ -600,6 +596,10 @@ struct arm_cmn_hw_event { > #define for_each_hw_dn(hw, dn, i) \ > for (i = 0, dn = hw->dn; i < hw->num_dns; i++, dn++) > > +/* @i is the DTC number, @idx is the counter index on that DTC */ > +#define for_each_hw_dtc_idx(hw, i, idx) \ > + for (int i = 0, idx; i < CMN_MAX_DTCS; i++) if ((idx = hw->dtc_idx[i]) >= 0) Isn't that "idx" unnecessary in the initialization? Cheers, Ilkka
On Fri, 20 Oct 2023, Robin Murphy wrote: > The bitmap-based scheme for tracking DTC counter usage turns out to be a > complete dead-end for its imagined purpose, since by the time we have to > keep track of a per-DTC counter index anyway, we already have enough > information to make the bitmap itself redundant. Revert the remains of > it back to almost the original scheme, but now expanded to track per-DTC > indices, in preparation for making use of them in anger. > > Note that since cycle count events always use a dedicated counter on a > single DTC, we reuse the field to encode their DTC index directly. > > Signed-off-by: Robin Murphy <robin.murphy@arm.com> Thanks! I had that on my task list but never had time to start working on it. Reviewed-by: Ilkka Koskinen <ilkka@os.amperecomputing.com> Cheers, Ilkka > --- > drivers/perf/arm-cmn.c | 126 +++++++++++++++++++++-------------------- > 1 file changed, 64 insertions(+), 62 deletions(-) > > diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c > index f1ac8d0cdb3b..675f1638013e 100644 > --- a/drivers/perf/arm-cmn.c > +++ b/drivers/perf/arm-cmn.c > @@ -281,16 +281,13 @@ struct arm_cmn_node { > u16 id, logid; > enum cmn_node_type type; > > - int dtm; > - union { > - /* DN/HN-F/CXHA */ > - struct { > - u8 val : 4; > - u8 count : 4; > - } occupid[SEL_MAX]; > - /* XP */ > - u8 dtc; > - }; > + u8 dtm; > + s8 dtc; > + /* DN/HN-F/CXHA */ > + struct { > + u8 val : 4; > + u8 count : 4; > + } occupid[SEL_MAX]; > union { > u8 event[4]; > __le32 event_sel; > @@ -540,12 +537,12 @@ static int arm_cmn_map_show(struct seq_file *s, void *data) > > seq_puts(s, "\n |"); > for (x = 0; x < cmn->mesh_x; x++) { > - u8 dtc = cmn->xps[xp_base + x].dtc; > + s8 dtc = cmn->xps[xp_base + x].dtc; > > - if (dtc & (dtc - 1)) > + if (dtc < 0) > seq_puts(s, " DTC ?? |"); > else > - seq_printf(s, " DTC %ld |", __ffs(dtc)); > + seq_printf(s, " DTC %d |", dtc); > } > seq_puts(s, "\n |"); > for (x = 0; x < cmn->mesh_x; x++) > @@ -589,8 +586,7 @@ static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {} > struct arm_cmn_hw_event { > struct arm_cmn_node *dn; > u64 dtm_idx[4]; > - unsigned int dtc_idx; > - u8 dtcs_used; > + s8 dtc_idx[CMN_MAX_DTCS]; > u8 num_dns; > u8 dtm_offset; > bool wide_sel; > @@ -600,6 +596,10 @@ struct arm_cmn_hw_event { > #define for_each_hw_dn(hw, dn, i) \ > for (i = 0, dn = hw->dn; i < hw->num_dns; i++, dn++) > > +/* @i is the DTC number, @idx is the counter index on that DTC */ > +#define for_each_hw_dtc_idx(hw, i, idx) \ > + for (int i = 0, idx; i < CMN_MAX_DTCS; i++) if ((idx = hw->dtc_idx[i]) >= 0) > + > static struct arm_cmn_hw_event *to_cmn_hw(struct perf_event *event) > { > BUILD_BUG_ON(sizeof(struct arm_cmn_hw_event) > offsetof(struct hw_perf_event, target)); > @@ -1429,12 +1429,11 @@ static void arm_cmn_init_counter(struct perf_event *event) > { > struct arm_cmn *cmn = to_cmn(event->pmu); > struct arm_cmn_hw_event *hw = to_cmn_hw(event); > - unsigned int i, pmevcnt = CMN_DT_PMEVCNT(hw->dtc_idx); > u64 count; > > - for (i = 0; hw->dtcs_used & (1U << i); i++) { > - writel_relaxed(CMN_COUNTER_INIT, cmn->dtc[i].base + pmevcnt); > - cmn->dtc[i].counters[hw->dtc_idx] = event; > + for_each_hw_dtc_idx(hw, i, idx) { > + writel_relaxed(CMN_COUNTER_INIT, cmn->dtc[i].base + CMN_DT_PMEVCNT(idx)); > + cmn->dtc[i].counters[idx] = event; > } > > count = arm_cmn_read_dtm(cmn, hw, false); > @@ -1447,11 +1446,9 @@ static void arm_cmn_event_read(struct perf_event *event) > struct arm_cmn_hw_event *hw = to_cmn_hw(event); > u64 delta, new, prev; > unsigned long flags; > - unsigned int i; > > - if (hw->dtc_idx == CMN_DT_NUM_COUNTERS) { > - i = __ffs(hw->dtcs_used); > - delta = arm_cmn_read_cc(cmn->dtc + i); > + if (CMN_EVENT_TYPE(event) == CMN_TYPE_DTC) { > + delta = arm_cmn_read_cc(cmn->dtc + hw->dtc_idx[0]); > local64_add(delta, &event->count); > return; > } > @@ -1461,8 +1458,8 @@ static void arm_cmn_event_read(struct perf_event *event) > delta = new - prev; > > local_irq_save(flags); > - for (i = 0; hw->dtcs_used & (1U << i); i++) { > - new = arm_cmn_read_counter(cmn->dtc + i, hw->dtc_idx); > + for_each_hw_dtc_idx(hw, i, idx) { > + new = arm_cmn_read_counter(cmn->dtc + i, idx); > delta += new << 16; > } > local_irq_restore(flags); > @@ -1518,7 +1515,7 @@ static void arm_cmn_event_start(struct perf_event *event, int flags) > int i; > > if (type == CMN_TYPE_DTC) { > - i = __ffs(hw->dtcs_used); > + i = hw->dtc_idx[0]; > writeq_relaxed(CMN_CC_INIT, cmn->dtc[i].base + CMN_DT_PMCCNTR); > cmn->dtc[i].cc_active = true; > } else if (type == CMN_TYPE_WP) { > @@ -1549,7 +1546,7 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags) > int i; > > if (type == CMN_TYPE_DTC) { > - i = __ffs(hw->dtcs_used); > + i = hw->dtc_idx[0]; > cmn->dtc[i].cc_active = false; > } else if (type == CMN_TYPE_WP) { > int wp_idx = arm_cmn_wp_idx(event); > @@ -1735,12 +1732,19 @@ static int arm_cmn_event_init(struct perf_event *event) > hw->dn = arm_cmn_node(cmn, type); > if (!hw->dn) > return -EINVAL; > + > + memset(hw->dtc_idx, -1, sizeof(hw->dtc_idx)); > for (dn = hw->dn; dn->type == type; dn++) { > if (bynodeid && dn->id != nodeid) { > hw->dn++; > continue; > } > hw->num_dns++; > + if (dn->dtc < 0) > + memset(hw->dtc_idx, 0, cmn->num_dtcs); > + else > + hw->dtc_idx[dn->dtc] = 0; > + > if (bynodeid) > break; > } > @@ -1752,12 +1756,6 @@ static int arm_cmn_event_init(struct perf_event *event) > nodeid, nid.x, nid.y, nid.port, nid.dev, type); > return -EINVAL; > } > - /* > - * Keep assuming non-cycles events count in all DTC domains; turns out > - * it's hard to make a worthwhile optimisation around this, short of > - * going all-in with domain-local counter allocation as well. > - */ > - hw->dtcs_used = (1U << cmn->num_dtcs) - 1; > > return arm_cmn_validate_group(cmn, event); > } > @@ -1783,28 +1781,25 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event, > } > memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx)); > > - for (i = 0; hw->dtcs_used & (1U << i); i++) > - cmn->dtc[i].counters[hw->dtc_idx] = NULL; > + for_each_hw_dtc_idx(hw, j, idx) > + cmn->dtc[j].counters[idx] = NULL; > } > > static int arm_cmn_event_add(struct perf_event *event, int flags) > { > struct arm_cmn *cmn = to_cmn(event->pmu); > struct arm_cmn_hw_event *hw = to_cmn_hw(event); > - struct arm_cmn_dtc *dtc = &cmn->dtc[0]; > struct arm_cmn_node *dn; > enum cmn_node_type type = CMN_EVENT_TYPE(event); > - unsigned int i, dtc_idx, input_sel; > + unsigned int input_sel, i = 0; > > if (type == CMN_TYPE_DTC) { > - i = 0; > while (cmn->dtc[i].cycles) > if (++i == cmn->num_dtcs) > return -ENOSPC; > > cmn->dtc[i].cycles = event; > - hw->dtc_idx = CMN_DT_NUM_COUNTERS; > - hw->dtcs_used = 1U << i; > + hw->dtc_idx[0] = i; > > if (flags & PERF_EF_START) > arm_cmn_event_start(event, 0); > @@ -1812,17 +1807,22 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) > } > > /* Grab a free global counter first... */ > - dtc_idx = 0; > - while (dtc->counters[dtc_idx]) > - if (++dtc_idx == CMN_DT_NUM_COUNTERS) > - return -ENOSPC; > - > - hw->dtc_idx = dtc_idx; > + for_each_hw_dtc_idx(hw, j, idx) { > + if (j > 0) { > + idx = hw->dtc_idx[0]; > + } else { > + idx = 0; > + while (cmn->dtc[j].counters[idx]) > + if (++idx == CMN_DT_NUM_COUNTERS) > + goto free_dtms; > + } > + hw->dtc_idx[j] = idx; > + } > > /* ...then the local counters to feed it. */ > for_each_hw_dn(hw, dn, i) { > struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset; > - unsigned int dtm_idx, shift; > + unsigned int dtm_idx, shift, d = 0; > u64 reg; > > dtm_idx = 0; > @@ -1841,11 +1841,11 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) > > tmp = dtm->wp_event[wp_idx ^ 1]; > if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) != > - CMN_EVENT_WP_COMBINE(dtc->counters[tmp])) > + CMN_EVENT_WP_COMBINE(cmn->dtc[d].counters[tmp])) > goto free_dtms; > > input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx; > - dtm->wp_event[wp_idx] = dtc_idx; > + dtm->wp_event[wp_idx] = hw->dtc_idx[d]; > writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx)); > } else { > struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); > @@ -1865,7 +1865,7 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) > dtm->input_sel[dtm_idx] = input_sel; > shift = CMN__PMEVCNTn_GLOBAL_NUM_SHIFT(dtm_idx); > dtm->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift); > - dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift; > + dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, hw->dtc_idx[d]) << shift; > dtm->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx); > reg = (u64)le32_to_cpu(dtm->pmu_config_high) << 32 | dtm->pmu_config_low; > writeq_relaxed(reg, dtm->base + CMN_DTM_PMU_CONFIG); > @@ -1893,7 +1893,7 @@ static void arm_cmn_event_del(struct perf_event *event, int flags) > arm_cmn_event_stop(event, PERF_EF_UPDATE); > > if (type == CMN_TYPE_DTC) > - cmn->dtc[__ffs(hw->dtcs_used)].cycles = NULL; > + cmn->dtc[hw->dtc_idx[0]].cycles = NULL; > else > arm_cmn_event_clear(cmn, event, hw->num_dns); > } > @@ -2074,7 +2074,6 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) > { > struct arm_cmn_node *dn, *xp; > int dtc_idx = 0; > - u8 dtcs_present = (1 << cmn->num_dtcs) - 1; > > cmn->dtc = devm_kcalloc(cmn->dev, cmn->num_dtcs, sizeof(cmn->dtc[0]), GFP_KERNEL); > if (!cmn->dtc) > @@ -2084,23 +2083,26 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) > > cmn->xps = arm_cmn_node(cmn, CMN_TYPE_XP); > > + if (cmn->part == PART_CMN600 && cmn->num_dtcs > 1) { > + /* We do at least know that a DTC's XP must be in that DTC's domain */ > + dn = arm_cmn_node(cmn, CMN_TYPE_DTC); > + for (int i = 0; i < cmn->num_dtcs; i++) > + arm_cmn_node_to_xp(cmn, dn + i)->dtc = i; > + } > + > for (dn = cmn->dns; dn->type; dn++) { > - if (dn->type == CMN_TYPE_XP) { > - dn->dtc &= dtcs_present; > + if (dn->type == CMN_TYPE_XP) > continue; > - } > > xp = arm_cmn_node_to_xp(cmn, dn); > + dn->dtc = xp->dtc; > dn->dtm = xp->dtm; > if (cmn->multi_dtm) > dn->dtm += arm_cmn_nid(cmn, dn->id).port / 2; > > if (dn->type == CMN_TYPE_DTC) { > - int err; > - /* We do at least know that a DTC's XP must be in that DTC's domain */ > - if (xp->dtc == 0xf) > - xp->dtc = 1 << dtc_idx; > - err = arm_cmn_init_dtc(cmn, dn, dtc_idx++); > + int err = arm_cmn_init_dtc(cmn, dn, dtc_idx++); > + > if (err) > return err; > } > @@ -2258,9 +2260,9 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) > cmn->mesh_x = xp->logid; > > if (cmn->part == PART_CMN600) > - xp->dtc = 0xf; > + xp->dtc = -1; > else > - xp->dtc = 1 << arm_cmn_dtc_domain(cmn, xp_region); > + xp->dtc = arm_cmn_dtc_domain(cmn, xp_region); > > xp->dtm = dtm - cmn->dtms; > arm_cmn_init_dtm(dtm++, xp, 0); > -- > 2.39.2.101.g768bb238c484.dirty > >
On Fri, Oct 20, 2023 at 03:50:30PM -0700, Ilkka Koskinen wrote: > Hi Robin, > > I have one comment, otherwise the patch looks good to me. > > +/* @i is the DTC number, @idx is the counter index on that DTC */ > > +#define for_each_hw_dtc_idx(hw, i, idx) \ > > + for (int i = 0, idx; i < CMN_MAX_DTCS; i++) if ((idx = hw->dtc_idx[i]) >= 0) > > Isn't that "idx" unnecessary in the initialization? That creates the 'idx' variable that's assigned to by `idx = hw->dtc_idx[i]`, so that is necessary. Mark.
On Fri, Oct 20, 2023 at 06:51:26PM +0100, Robin Murphy wrote: > The bitmap-based scheme for tracking DTC counter usage turns out to be a > complete dead-end for its imagined purpose, since by the time we have to > keep track of a per-DTC counter index anyway, we already have enough > information to make the bitmap itself redundant. Revert the remains of > it back to almost the original scheme, but now expanded to track per-DTC > indices, in preparation for making use of them in anger. > > Note that since cycle count events always use a dedicated counter on a > single DTC, we reuse the field to encode their DTC index directly. > > Signed-off-by: Robin Murphy <robin.murphy@arm.com> > --- > drivers/perf/arm-cmn.c | 126 +++++++++++++++++++++-------------------- > 1 file changed, 64 insertions(+), 62 deletions(-) > > diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c > index f1ac8d0cdb3b..675f1638013e 100644 > --- a/drivers/perf/arm-cmn.c > +++ b/drivers/perf/arm-cmn.c > @@ -281,16 +281,13 @@ struct arm_cmn_node { > u16 id, logid; > enum cmn_node_type type; > > - int dtm; > - union { > - /* DN/HN-F/CXHA */ > - struct { > - u8 val : 4; > - u8 count : 4; > - } occupid[SEL_MAX]; > - /* XP */ > - u8 dtc; > - }; > + u8 dtm; > + s8 dtc; > + /* DN/HN-F/CXHA */ > + struct { > + u8 val : 4; > + u8 count : 4; > + } occupid[SEL_MAX]; > union { > u8 event[4]; > __le32 event_sel; > @@ -540,12 +537,12 @@ static int arm_cmn_map_show(struct seq_file *s, void *data) > > seq_puts(s, "\n |"); > for (x = 0; x < cmn->mesh_x; x++) { > - u8 dtc = cmn->xps[xp_base + x].dtc; > + s8 dtc = cmn->xps[xp_base + x].dtc; > > - if (dtc & (dtc - 1)) > + if (dtc < 0) > seq_puts(s, " DTC ?? |"); > else > - seq_printf(s, " DTC %ld |", __ffs(dtc)); > + seq_printf(s, " DTC %d |", dtc); > } > seq_puts(s, "\n |"); > for (x = 0; x < cmn->mesh_x; x++) > @@ -589,8 +586,7 @@ static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {} > struct arm_cmn_hw_event { > struct arm_cmn_node *dn; > u64 dtm_idx[4]; > - unsigned int dtc_idx; > - u8 dtcs_used; > + s8 dtc_idx[CMN_MAX_DTCS]; > u8 num_dns; > u8 dtm_offset; > bool wide_sel; > @@ -600,6 +596,10 @@ struct arm_cmn_hw_event { > #define for_each_hw_dn(hw, dn, i) \ > for (i = 0, dn = hw->dn; i < hw->num_dns; i++, dn++) > > +/* @i is the DTC number, @idx is the counter index on that DTC */ > +#define for_each_hw_dtc_idx(hw, i, idx) \ > + for (int i = 0, idx; i < CMN_MAX_DTCS; i++) if ((idx = hw->dtc_idx[i]) >= 0) This macro is pretty hideous ;) The kbuild robot complained as well, but given that it's internal to the driver and it does make the callsites quite a bit simpler, I'm inclined to stick with it for now. At least, I couldn't come up with something else which was just as succinct. Will
On 2023-10-23 10:06, Mark Rutland wrote: > On Fri, Oct 20, 2023 at 03:50:30PM -0700, Ilkka Koskinen wrote: >> Hi Robin, >> >> I have one comment, otherwise the patch looks good to me. > >>> +/* @i is the DTC number, @idx is the counter index on that DTC */ >>> +#define for_each_hw_dtc_idx(hw, i, idx) \ >>> + for (int i = 0, idx; i < CMN_MAX_DTCS; i++) if ((idx = hw->dtc_idx[i]) >= 0) >> >> Isn't that "idx" unnecessary in the initialization? > > That creates the 'idx' variable that's assigned to by `idx = hw->dtc_idx[i]`, > so that is necessary. Right, the intent is to take advantage of locally-scoped iterator variables since they're a nice thing, but completely-implicit definitions aren't so nice (at best they'd be surprising, at worst they could confusingly shadow existing variables in the outer scope), so the macro invoker still provides their names to at least give some visible context to their use within the loop body. I guess this is still a fairly new paradigm since the C11 switch, and I'm not aware of any "standard" style for such iterator macros yet, so I just did what seemed most logical. Thanks, Robin.
On Mon, 23 Oct 2023, Mark Rutland wrote: > On Fri, Oct 20, 2023 at 03:50:30PM -0700, Ilkka Koskinen wrote: >> Hi Robin, >> >> I have one comment, otherwise the patch looks good to me. > >>> +/* @i is the DTC number, @idx is the counter index on that DTC */ >>> +#define for_each_hw_dtc_idx(hw, i, idx) \ >>> + for (int i = 0, idx; i < CMN_MAX_DTCS; i++) if ((idx = hw->dtc_idx[i]) >= 0) >> >> Isn't that "idx" unnecessary in the initialization? > > That creates the 'idx' variable that's assigned to by `idx = hw->dtc_idx[i]`, > so that is necessary. Uh, it clearly does that. I have absoutely no idea what I was thinking about here. Sorry about the confusion! --Ilkka > > Mark. >
diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index f1ac8d0cdb3b..675f1638013e 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -281,16 +281,13 @@ struct arm_cmn_node { u16 id, logid; enum cmn_node_type type; - int dtm; - union { - /* DN/HN-F/CXHA */ - struct { - u8 val : 4; - u8 count : 4; - } occupid[SEL_MAX]; - /* XP */ - u8 dtc; - }; + u8 dtm; + s8 dtc; + /* DN/HN-F/CXHA */ + struct { + u8 val : 4; + u8 count : 4; + } occupid[SEL_MAX]; union { u8 event[4]; __le32 event_sel; @@ -540,12 +537,12 @@ static int arm_cmn_map_show(struct seq_file *s, void *data) seq_puts(s, "\n |"); for (x = 0; x < cmn->mesh_x; x++) { - u8 dtc = cmn->xps[xp_base + x].dtc; + s8 dtc = cmn->xps[xp_base + x].dtc; - if (dtc & (dtc - 1)) + if (dtc < 0) seq_puts(s, " DTC ?? |"); else - seq_printf(s, " DTC %ld |", __ffs(dtc)); + seq_printf(s, " DTC %d |", dtc); } seq_puts(s, "\n |"); for (x = 0; x < cmn->mesh_x; x++) @@ -589,8 +586,7 @@ static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {} struct arm_cmn_hw_event { struct arm_cmn_node *dn; u64 dtm_idx[4]; - unsigned int dtc_idx; - u8 dtcs_used; + s8 dtc_idx[CMN_MAX_DTCS]; u8 num_dns; u8 dtm_offset; bool wide_sel; @@ -600,6 +596,10 @@ struct arm_cmn_hw_event { #define for_each_hw_dn(hw, dn, i) \ for (i = 0, dn = hw->dn; i < hw->num_dns; i++, dn++) +/* @i is the DTC number, @idx is the counter index on that DTC */ +#define for_each_hw_dtc_idx(hw, i, idx) \ + for (int i = 0, idx; i < CMN_MAX_DTCS; i++) if ((idx = hw->dtc_idx[i]) >= 0) + static struct arm_cmn_hw_event *to_cmn_hw(struct perf_event *event) { BUILD_BUG_ON(sizeof(struct arm_cmn_hw_event) > offsetof(struct hw_perf_event, target)); @@ -1429,12 +1429,11 @@ static void arm_cmn_init_counter(struct perf_event *event) { struct arm_cmn *cmn = to_cmn(event->pmu); struct arm_cmn_hw_event *hw = to_cmn_hw(event); - unsigned int i, pmevcnt = CMN_DT_PMEVCNT(hw->dtc_idx); u64 count; - for (i = 0; hw->dtcs_used & (1U << i); i++) { - writel_relaxed(CMN_COUNTER_INIT, cmn->dtc[i].base + pmevcnt); - cmn->dtc[i].counters[hw->dtc_idx] = event; + for_each_hw_dtc_idx(hw, i, idx) { + writel_relaxed(CMN_COUNTER_INIT, cmn->dtc[i].base + CMN_DT_PMEVCNT(idx)); + cmn->dtc[i].counters[idx] = event; } count = arm_cmn_read_dtm(cmn, hw, false); @@ -1447,11 +1446,9 @@ static void arm_cmn_event_read(struct perf_event *event) struct arm_cmn_hw_event *hw = to_cmn_hw(event); u64 delta, new, prev; unsigned long flags; - unsigned int i; - if (hw->dtc_idx == CMN_DT_NUM_COUNTERS) { - i = __ffs(hw->dtcs_used); - delta = arm_cmn_read_cc(cmn->dtc + i); + if (CMN_EVENT_TYPE(event) == CMN_TYPE_DTC) { + delta = arm_cmn_read_cc(cmn->dtc + hw->dtc_idx[0]); local64_add(delta, &event->count); return; } @@ -1461,8 +1458,8 @@ static void arm_cmn_event_read(struct perf_event *event) delta = new - prev; local_irq_save(flags); - for (i = 0; hw->dtcs_used & (1U << i); i++) { - new = arm_cmn_read_counter(cmn->dtc + i, hw->dtc_idx); + for_each_hw_dtc_idx(hw, i, idx) { + new = arm_cmn_read_counter(cmn->dtc + i, idx); delta += new << 16; } local_irq_restore(flags); @@ -1518,7 +1515,7 @@ static void arm_cmn_event_start(struct perf_event *event, int flags) int i; if (type == CMN_TYPE_DTC) { - i = __ffs(hw->dtcs_used); + i = hw->dtc_idx[0]; writeq_relaxed(CMN_CC_INIT, cmn->dtc[i].base + CMN_DT_PMCCNTR); cmn->dtc[i].cc_active = true; } else if (type == CMN_TYPE_WP) { @@ -1549,7 +1546,7 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags) int i; if (type == CMN_TYPE_DTC) { - i = __ffs(hw->dtcs_used); + i = hw->dtc_idx[0]; cmn->dtc[i].cc_active = false; } else if (type == CMN_TYPE_WP) { int wp_idx = arm_cmn_wp_idx(event); @@ -1735,12 +1732,19 @@ static int arm_cmn_event_init(struct perf_event *event) hw->dn = arm_cmn_node(cmn, type); if (!hw->dn) return -EINVAL; + + memset(hw->dtc_idx, -1, sizeof(hw->dtc_idx)); for (dn = hw->dn; dn->type == type; dn++) { if (bynodeid && dn->id != nodeid) { hw->dn++; continue; } hw->num_dns++; + if (dn->dtc < 0) + memset(hw->dtc_idx, 0, cmn->num_dtcs); + else + hw->dtc_idx[dn->dtc] = 0; + if (bynodeid) break; } @@ -1752,12 +1756,6 @@ static int arm_cmn_event_init(struct perf_event *event) nodeid, nid.x, nid.y, nid.port, nid.dev, type); return -EINVAL; } - /* - * Keep assuming non-cycles events count in all DTC domains; turns out - * it's hard to make a worthwhile optimisation around this, short of - * going all-in with domain-local counter allocation as well. - */ - hw->dtcs_used = (1U << cmn->num_dtcs) - 1; return arm_cmn_validate_group(cmn, event); } @@ -1783,28 +1781,25 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event, } memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx)); - for (i = 0; hw->dtcs_used & (1U << i); i++) - cmn->dtc[i].counters[hw->dtc_idx] = NULL; + for_each_hw_dtc_idx(hw, j, idx) + cmn->dtc[j].counters[idx] = NULL; } static int arm_cmn_event_add(struct perf_event *event, int flags) { struct arm_cmn *cmn = to_cmn(event->pmu); struct arm_cmn_hw_event *hw = to_cmn_hw(event); - struct arm_cmn_dtc *dtc = &cmn->dtc[0]; struct arm_cmn_node *dn; enum cmn_node_type type = CMN_EVENT_TYPE(event); - unsigned int i, dtc_idx, input_sel; + unsigned int input_sel, i = 0; if (type == CMN_TYPE_DTC) { - i = 0; while (cmn->dtc[i].cycles) if (++i == cmn->num_dtcs) return -ENOSPC; cmn->dtc[i].cycles = event; - hw->dtc_idx = CMN_DT_NUM_COUNTERS; - hw->dtcs_used = 1U << i; + hw->dtc_idx[0] = i; if (flags & PERF_EF_START) arm_cmn_event_start(event, 0); @@ -1812,17 +1807,22 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) } /* Grab a free global counter first... */ - dtc_idx = 0; - while (dtc->counters[dtc_idx]) - if (++dtc_idx == CMN_DT_NUM_COUNTERS) - return -ENOSPC; - - hw->dtc_idx = dtc_idx; + for_each_hw_dtc_idx(hw, j, idx) { + if (j > 0) { + idx = hw->dtc_idx[0]; + } else { + idx = 0; + while (cmn->dtc[j].counters[idx]) + if (++idx == CMN_DT_NUM_COUNTERS) + goto free_dtms; + } + hw->dtc_idx[j] = idx; + } /* ...then the local counters to feed it. */ for_each_hw_dn(hw, dn, i) { struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset; - unsigned int dtm_idx, shift; + unsigned int dtm_idx, shift, d = 0; u64 reg; dtm_idx = 0; @@ -1841,11 +1841,11 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) tmp = dtm->wp_event[wp_idx ^ 1]; if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) != - CMN_EVENT_WP_COMBINE(dtc->counters[tmp])) + CMN_EVENT_WP_COMBINE(cmn->dtc[d].counters[tmp])) goto free_dtms; input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx; - dtm->wp_event[wp_idx] = dtc_idx; + dtm->wp_event[wp_idx] = hw->dtc_idx[d]; writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx)); } else { struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); @@ -1865,7 +1865,7 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) dtm->input_sel[dtm_idx] = input_sel; shift = CMN__PMEVCNTn_GLOBAL_NUM_SHIFT(dtm_idx); dtm->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift); - dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift; + dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, hw->dtc_idx[d]) << shift; dtm->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx); reg = (u64)le32_to_cpu(dtm->pmu_config_high) << 32 | dtm->pmu_config_low; writeq_relaxed(reg, dtm->base + CMN_DTM_PMU_CONFIG); @@ -1893,7 +1893,7 @@ static void arm_cmn_event_del(struct perf_event *event, int flags) arm_cmn_event_stop(event, PERF_EF_UPDATE); if (type == CMN_TYPE_DTC) - cmn->dtc[__ffs(hw->dtcs_used)].cycles = NULL; + cmn->dtc[hw->dtc_idx[0]].cycles = NULL; else arm_cmn_event_clear(cmn, event, hw->num_dns); } @@ -2074,7 +2074,6 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) { struct arm_cmn_node *dn, *xp; int dtc_idx = 0; - u8 dtcs_present = (1 << cmn->num_dtcs) - 1; cmn->dtc = devm_kcalloc(cmn->dev, cmn->num_dtcs, sizeof(cmn->dtc[0]), GFP_KERNEL); if (!cmn->dtc) @@ -2084,23 +2083,26 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn) cmn->xps = arm_cmn_node(cmn, CMN_TYPE_XP); + if (cmn->part == PART_CMN600 && cmn->num_dtcs > 1) { + /* We do at least know that a DTC's XP must be in that DTC's domain */ + dn = arm_cmn_node(cmn, CMN_TYPE_DTC); + for (int i = 0; i < cmn->num_dtcs; i++) + arm_cmn_node_to_xp(cmn, dn + i)->dtc = i; + } + for (dn = cmn->dns; dn->type; dn++) { - if (dn->type == CMN_TYPE_XP) { - dn->dtc &= dtcs_present; + if (dn->type == CMN_TYPE_XP) continue; - } xp = arm_cmn_node_to_xp(cmn, dn); + dn->dtc = xp->dtc; dn->dtm = xp->dtm; if (cmn->multi_dtm) dn->dtm += arm_cmn_nid(cmn, dn->id).port / 2; if (dn->type == CMN_TYPE_DTC) { - int err; - /* We do at least know that a DTC's XP must be in that DTC's domain */ - if (xp->dtc == 0xf) - xp->dtc = 1 << dtc_idx; - err = arm_cmn_init_dtc(cmn, dn, dtc_idx++); + int err = arm_cmn_init_dtc(cmn, dn, dtc_idx++); + if (err) return err; } @@ -2258,9 +2260,9 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) cmn->mesh_x = xp->logid; if (cmn->part == PART_CMN600) - xp->dtc = 0xf; + xp->dtc = -1; else - xp->dtc = 1 << arm_cmn_dtc_domain(cmn, xp_region); + xp->dtc = arm_cmn_dtc_domain(cmn, xp_region); xp->dtm = dtm - cmn->dtms; arm_cmn_init_dtm(dtm++, xp, 0);
The bitmap-based scheme for tracking DTC counter usage turns out to be a complete dead-end for its imagined purpose, since by the time we have to keep track of a per-DTC counter index anyway, we already have enough information to make the bitmap itself redundant. Revert the remains of it back to almost the original scheme, but now expanded to track per-DTC indices, in preparation for making use of them in anger. Note that since cycle count events always use a dedicated counter on a single DTC, we reuse the field to encode their DTC index directly. Signed-off-by: Robin Murphy <robin.murphy@arm.com> --- drivers/perf/arm-cmn.c | 126 +++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 62 deletions(-)