Message ID | 20210809131057.1694145-13-danielhb413@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | PMU-EBB support for PPC64 TCG | expand |
On Mon, Aug 09, 2021 at 10:10:50AM -0300, Daniel Henrique Barboza wrote: > This patch starts the counter negative EBB support by enabling PMC1 > counter negative condition. > > A counter negative condition happens when a performance monitor counter > reaches the value 0x80000000. When that happens, if a counter negative > condition is enabled in that counter, a performance monitor alert is > triggered. For PMC1, this condition is enabled by MMCR0_PMC1CE. > > An icount-based logic is used to predict when we need to wake up the timer > to trigger the alert in both PM_INST_CMPL (0x2) and PM_CYC (0x1E) events. > The timer callback will then trigger a PPC_INTERRUPT_PMC which will become a > event-based exception later. > > Some EBB powerpc kernel selftests are passing after this patch, but a > substancial amount of them relies on other PMCs to be enabled and events > that we don't support at this moment. We'll address that in the next > patches. > > Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com> > --- > target/ppc/cpu.h | 1 + > target/ppc/pmu_book3s_helper.c | 127 +++++++++++++++++++++++---------- > 2 files changed, 92 insertions(+), 36 deletions(-) > > diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h > index 1d38b8cf7a..5c81d459f4 100644 > --- a/target/ppc/cpu.h > +++ b/target/ppc/cpu.h > @@ -350,6 +350,7 @@ typedef struct ppc_v3_pate_t { > #define MMCR0_EBE PPC_BIT(43) /* Perf Monitor EBB Enable */ > #define MMCR0_FCECE PPC_BIT(38) /* FC on Enabled Cond or Event */ > #define MMCR0_PMCC PPC_BITMASK(44, 45) /* PMC Control */ > +#define MMCR0_PMC1CE PPC_BIT(48) > > #define MMCR1_PMC1SEL_SHIFT (63 - 39) > #define MMCR1_PMC1SEL PPC_BITMASK(32, 39) > diff --git a/target/ppc/pmu_book3s_helper.c b/target/ppc/pmu_book3s_helper.c > index 43cc0eb722..58ae65e22b 100644 > --- a/target/ppc/pmu_book3s_helper.c > +++ b/target/ppc/pmu_book3s_helper.c > @@ -25,6 +25,7 @@ > * and SPAPR code. 
> */ > #define PPC_CPU_FREQ 1000000000 > +#define COUNTER_NEGATIVE_VAL 0x80000000 > > static uint64_t get_cycles(uint64_t icount_delta) > { > @@ -32,22 +33,9 @@ static uint64_t get_cycles(uint64_t icount_delta) > NANOSECONDS_PER_SECOND); > } > > -static void update_PMC_PM_INST_CMPL(CPUPPCState *env, int sprn, > - uint64_t icount_delta) > -{ > - env->spr[sprn] += icount_delta; > -} > - > -static void update_PMC_PM_CYC(CPUPPCState *env, int sprn, > - uint64_t icount_delta) > -{ > - env->spr[sprn] += get_cycles(icount_delta); > -} > - > -static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, > - uint64_t icount_delta) > +static uint8_t get_PMC_event(CPUPPCState *env, int sprn) > { > - int event; > + int event = 0x0; > > switch (sprn) { > case SPR_POWER_PMC1: > @@ -65,11 +53,35 @@ static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, > case SPR_POWER_PMC4: > event = MMCR1_PMC4SEL & env->spr[SPR_POWER_MMCR1]; > break; > + case SPR_POWER_PMC5: > + event = 0x2; > + break; > + case SPR_POWER_PMC6: > + event = 0x1E; > + break; This looks like a nice cleanup that would be better folded into an earlier patch. 
> default: > - return; > + break; > } > > - switch (event) { > + return event; > +} > + > +static void update_PMC_PM_INST_CMPL(CPUPPCState *env, int sprn, > + uint64_t icount_delta) > +{ > + env->spr[sprn] += icount_delta; > +} > + > +static void update_PMC_PM_CYC(CPUPPCState *env, int sprn, > + uint64_t icount_delta) > +{ > + env->spr[sprn] += get_cycles(icount_delta); > +} > + > +static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, > + uint64_t icount_delta) > +{ > + switch (get_PMC_event(env, sprn)) { > case 0x2: > update_PMC_PM_INST_CMPL(env, sprn, icount_delta); > break; > @@ -99,30 +111,57 @@ static void update_PMCs(CPUPPCState *env, uint64_t icount_delta) > update_PMC_PM_CYC(env, SPR_POWER_PMC6, icount_delta); > } > > +static void set_PMU_excp_timer(CPUPPCState *env) > +{ > + uint64_t timeout, now, remaining_val; > + > + if (!(env->spr[SPR_POWER_MMCR0] & MMCR0_PMC1CE)) { > + return; > + } > + > + remaining_val = COUNTER_NEGATIVE_VAL - env->spr[SPR_POWER_PMC1]; > + > + switch (get_PMC_event(env, SPR_POWER_PMC1)) { > + case 0x2: > + timeout = icount_to_ns(remaining_val); > + break; > + case 0x1e: > + timeout = muldiv64(remaining_val, NANOSECONDS_PER_SECOND, > + PPC_CPU_FREQ); So.. this appears to be simulating to the guest that cycles are occurring at a constant rate, consistent with the advertised CPU frequency. Which sounds right, except... it's not clear to me that you're using the same logic to generate the values you read from the cycles PMC (in that case it shouldn't need to reference icount at all, right?). 
> + break; > + default: > + return; > + } > + > + now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); > + > + timer_mod(env->pmu_intr_timer, now + timeout); > +} > + > static void cpu_ppc_pmu_timer_cb(void *opaque) > { > PowerPCCPU *cpu = opaque; > CPUPPCState *env = &cpu->env; > - uint64_t mmcr0; > - > - mmcr0 = env->spr[SPR_POWER_MMCR0]; > - if (env->spr[SPR_POWER_MMCR0] & MMCR0_EBE) { > - /* freeeze counters if needed */ > - if (mmcr0 & MMCR0_FCECE) { > - mmcr0 &= ~MMCR0_FCECE; > - mmcr0 |= MMCR0_FC; > - } > + uint64_t icount_delta = (uint64_t)icount_get_raw() - env->pmu_base_icount; > > - /* Clear PMAE and set PMAO */ > - if (mmcr0 & MMCR0_PMAE) { > - mmcr0 &= ~MMCR0_PMAE; > - mmcr0 |= MMCR0_PMAO; > - } > - env->spr[SPR_POWER_MMCR0] = mmcr0; > + if (!(env->spr[SPR_POWER_MMCR0] & MMCR0_EBE)) { > + return; > + } > + > + update_PMCs(env, icount_delta); > + > + if (env->spr[SPR_POWER_MMCR0] & MMCR0_FCECE) { > + env->spr[SPR_POWER_MMCR0] &= ~MMCR0_FCECE; > + env->spr[SPR_POWER_MMCR0] |= MMCR0_FC; > + } > > - /* Fire the PMC hardware exception */ > - ppc_set_irq(cpu, PPC_INTERRUPT_PMC, 1); > + if (env->spr[SPR_POWER_MMCR0] & MMCR0_PMAE) { > + env->spr[SPR_POWER_MMCR0] &= ~MMCR0_PMAE; > + env->spr[SPR_POWER_MMCR0] |= MMCR0_PMAO; > } > + > + /* Fire the PMC hardware exception */ > + ppc_set_irq(cpu, PPC_INTERRUPT_PMC, 1); > } > > void cpu_ppc_pmu_timer_init(CPUPPCState *env) > @@ -134,12 +173,19 @@ void cpu_ppc_pmu_timer_init(CPUPPCState *env) > env->pmu_intr_timer = timer; > } > > +static bool mmcr0_counter_neg_cond_enabled(uint64_t mmcr0) > +{ > + return mmcr0 & MMCR0_PMC1CE; > +} > + > void helper_store_mmcr0(CPUPPCState *env, target_ulong value) > { > uint64_t curr_icount = (uint64_t)icount_get_raw(); > bool curr_FC = env->spr[SPR_POWER_MMCR0] & MMCR0_FC; > bool new_FC = value & MMCR0_FC; > > + env->spr[SPR_POWER_MMCR0] = value; > + > /* > * In an frozen count (FC) bit change: > * > @@ -163,10 +209,19 @@ void helper_store_mmcr0(CPUPPCState *env, target_ulong value) > * 
until the freeze. > */ > update_PMCs(env, icount_delta); > + > + /* delete pending timer */ > + timer_del(env->pmu_intr_timer); > } else { > env->pmu_base_icount = curr_icount; > + > + /* > + * Start performance monitor alert timer for counter negative > + * events, if needed. > + */ > + if (mmcr0_counter_neg_cond_enabled(env->spr[SPR_POWER_MMCR0])) { > + set_PMU_excp_timer(env); > + } > } > } > - > - env->spr[SPR_POWER_MMCR0] = value; > }
On 8/10/21 1:01 AM, David Gibson wrote: > On Mon, Aug 09, 2021 at 10:10:50AM -0300, Daniel Henrique Barboza wrote: >> This patch starts the counter negative EBB support by enabling PMC1 >> counter negative condition. >> >> A counter negative condition happens when a performance monitor counter >> reaches the value 0x80000000. When that happens, if a counter negative >> condition is enabled in that counter, a performance monitor alert is >> triggered. For PMC1, this condition is enabled by MMCR0_PMC1CE. >> >> An icount-based logic is used to predict when we need to wake up the timer >> to trigger the alert in both PM_INST_CMPL (0x2) and PM_CYC (0x1E) events. >> The timer callback will then trigger a PPC_INTERRUPT_PMC which will become a >> event-based exception later. >> >> Some EBB powerpc kernel selftests are passing after this patch, but a >> substancial amount of them relies on other PMCs to be enabled and events >> that we don't support at this moment. We'll address that in the next >> patches. 
>> >> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com> >> --- >> target/ppc/cpu.h | 1 + >> target/ppc/pmu_book3s_helper.c | 127 +++++++++++++++++++++++---------- >> 2 files changed, 92 insertions(+), 36 deletions(-) >> >> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h >> index 1d38b8cf7a..5c81d459f4 100644 >> --- a/target/ppc/cpu.h >> +++ b/target/ppc/cpu.h >> @@ -350,6 +350,7 @@ typedef struct ppc_v3_pate_t { >> #define MMCR0_EBE PPC_BIT(43) /* Perf Monitor EBB Enable */ >> #define MMCR0_FCECE PPC_BIT(38) /* FC on Enabled Cond or Event */ >> #define MMCR0_PMCC PPC_BITMASK(44, 45) /* PMC Control */ >> +#define MMCR0_PMC1CE PPC_BIT(48) >> >> #define MMCR1_PMC1SEL_SHIFT (63 - 39) >> #define MMCR1_PMC1SEL PPC_BITMASK(32, 39) >> diff --git a/target/ppc/pmu_book3s_helper.c b/target/ppc/pmu_book3s_helper.c >> index 43cc0eb722..58ae65e22b 100644 >> --- a/target/ppc/pmu_book3s_helper.c >> +++ b/target/ppc/pmu_book3s_helper.c >> @@ -25,6 +25,7 @@ >> * and SPAPR code. >> */ >> #define PPC_CPU_FREQ 1000000000 >> +#define COUNTER_NEGATIVE_VAL 0x80000000 >> >> static uint64_t get_cycles(uint64_t icount_delta) >> { >> @@ -32,22 +33,9 @@ static uint64_t get_cycles(uint64_t icount_delta) >> NANOSECONDS_PER_SECOND); >> } >> >> -static void update_PMC_PM_INST_CMPL(CPUPPCState *env, int sprn, >> - uint64_t icount_delta) >> -{ >> - env->spr[sprn] += icount_delta; >> -} >> - >> -static void update_PMC_PM_CYC(CPUPPCState *env, int sprn, >> - uint64_t icount_delta) >> -{ >> - env->spr[sprn] += get_cycles(icount_delta); >> -} >> - >> -static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, >> - uint64_t icount_delta) >> +static uint8_t get_PMC_event(CPUPPCState *env, int sprn) >> { >> - int event; >> + int event = 0x0; >> >> switch (sprn) { >> case SPR_POWER_PMC1: >> @@ -65,11 +53,35 @@ static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, >> case SPR_POWER_PMC4: >> event = MMCR1_PMC4SEL & env->spr[SPR_POWER_MMCR1]; >> break; >> + case 
SPR_POWER_PMC5: >> + event = 0x2; >> + break; >> + case SPR_POWER_PMC6: >> + event = 0x1E; >> + break; > > This looks like a nice cleanup that would be better folded into an > earlier patch. > >> default: >> - return; >> + break; >> } >> >> - switch (event) { >> + return event; >> +} >> + >> +static void update_PMC_PM_INST_CMPL(CPUPPCState *env, int sprn, >> + uint64_t icount_delta) >> +{ >> + env->spr[sprn] += icount_delta; >> +} >> + >> +static void update_PMC_PM_CYC(CPUPPCState *env, int sprn, >> + uint64_t icount_delta) >> +{ >> + env->spr[sprn] += get_cycles(icount_delta); >> +} >> + >> +static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, >> + uint64_t icount_delta) >> +{ >> + switch (get_PMC_event(env, sprn)) { >> case 0x2: >> update_PMC_PM_INST_CMPL(env, sprn, icount_delta); >> break; >> @@ -99,30 +111,57 @@ static void update_PMCs(CPUPPCState *env, uint64_t icount_delta) >> update_PMC_PM_CYC(env, SPR_POWER_PMC6, icount_delta); >> } >> >> +static void set_PMU_excp_timer(CPUPPCState *env) >> +{ >> + uint64_t timeout, now, remaining_val; >> + >> + if (!(env->spr[SPR_POWER_MMCR0] & MMCR0_PMC1CE)) { >> + return; >> + } >> + >> + remaining_val = COUNTER_NEGATIVE_VAL - env->spr[SPR_POWER_PMC1]; >> + >> + switch (get_PMC_event(env, SPR_POWER_PMC1)) { >> + case 0x2: >> + timeout = icount_to_ns(remaining_val); >> + break; >> + case 0x1e: >> + timeout = muldiv64(remaining_val, NANOSECONDS_PER_SECOND, >> + PPC_CPU_FREQ); > > So.. this appears to be simulating to the guest that cycles are > occurring at a constant rate, consistent with the advertised CPU > frequency. Which sounds right, except... it's not clear to me that > you're using the same logic to generate the values you read from the > cycles PMC (in that case it shouldn't need to reference icount at all, > right?). 'remaining_val' meaning depends on the event being sampled in the PMC in that moment. 
PMCs 1 to 4 can have a multitude of events, PMC5 is always count instructions and PMC6 is always counting cycles. For 0x02, env->spr[SPR_POWER_PMC1] contains instructions. remaining_val is the remaining insns for the counter negative condition, and icount_to_ns() is the timeout estimation for that. The value of the PMC1 will be set via update_PMC_PM_INST_CMPL(), which in turn is just a matter of summing the elapsed icount delta between start and freeze into the PMC. For 0x1e, env->spr[SPR_POWER_PMC1] contains cycles. remaining_val is the remaining cycles for counter negative cycles, and this muldiv64 calc is the timeout estimation in this case. The PMC value is set via update_PMC_PM_CYC(), which in turn does a math with the current icount delta in get_cycles(icount_delta) to get the current PMC value. All the metrics implemented in this PMU relies on 'icount_delta', the amount of icount units between the change states of MMCR0_FC (and other freeze counter bits like patch 17). Thanks, Daniel > >> + break; >> + default: >> + return; >> + } >> + >> + now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); >> + >> + timer_mod(env->pmu_intr_timer, now + timeout); >> +} >> + >> static void cpu_ppc_pmu_timer_cb(void *opaque) >> { >> PowerPCCPU *cpu = opaque; >> CPUPPCState *env = &cpu->env; >> - uint64_t mmcr0; >> - >> - mmcr0 = env->spr[SPR_POWER_MMCR0]; >> - if (env->spr[SPR_POWER_MMCR0] & MMCR0_EBE) { >> - /* freeeze counters if needed */ >> - if (mmcr0 & MMCR0_FCECE) { >> - mmcr0 &= ~MMCR0_FCECE; >> - mmcr0 |= MMCR0_FC; >> - } >> + uint64_t icount_delta = (uint64_t)icount_get_raw() - env->pmu_base_icount; >> >> - /* Clear PMAE and set PMAO */ >> - if (mmcr0 & MMCR0_PMAE) { >> - mmcr0 &= ~MMCR0_PMAE; >> - mmcr0 |= MMCR0_PMAO; >> - } >> - env->spr[SPR_POWER_MMCR0] = mmcr0; >> + if (!(env->spr[SPR_POWER_MMCR0] & MMCR0_EBE)) { >> + return; >> + } >> + >> + update_PMCs(env, icount_delta); >> + >> + if (env->spr[SPR_POWER_MMCR0] & MMCR0_FCECE) { >> + env->spr[SPR_POWER_MMCR0] &= 
~MMCR0_FCECE; >> + env->spr[SPR_POWER_MMCR0] |= MMCR0_FC; >> + } >> >> - /* Fire the PMC hardware exception */ >> - ppc_set_irq(cpu, PPC_INTERRUPT_PMC, 1); >> + if (env->spr[SPR_POWER_MMCR0] & MMCR0_PMAE) { >> + env->spr[SPR_POWER_MMCR0] &= ~MMCR0_PMAE; >> + env->spr[SPR_POWER_MMCR0] |= MMCR0_PMAO; >> } >> + >> + /* Fire the PMC hardware exception */ >> + ppc_set_irq(cpu, PPC_INTERRUPT_PMC, 1); >> } >> >> void cpu_ppc_pmu_timer_init(CPUPPCState *env) >> @@ -134,12 +173,19 @@ void cpu_ppc_pmu_timer_init(CPUPPCState *env) >> env->pmu_intr_timer = timer; >> } >> >> +static bool mmcr0_counter_neg_cond_enabled(uint64_t mmcr0) >> +{ >> + return mmcr0 & MMCR0_PMC1CE; >> +} >> + >> void helper_store_mmcr0(CPUPPCState *env, target_ulong value) >> { >> uint64_t curr_icount = (uint64_t)icount_get_raw(); >> bool curr_FC = env->spr[SPR_POWER_MMCR0] & MMCR0_FC; >> bool new_FC = value & MMCR0_FC; >> >> + env->spr[SPR_POWER_MMCR0] = value; >> + >> /* >> * In an frozen count (FC) bit change: >> * >> @@ -163,10 +209,19 @@ void helper_store_mmcr0(CPUPPCState *env, target_ulong value) >> * until the freeze. >> */ >> update_PMCs(env, icount_delta); >> + >> + /* delete pending timer */ >> + timer_del(env->pmu_intr_timer); >> } else { >> env->pmu_base_icount = curr_icount; >> + >> + /* >> + * Start performance monitor alert timer for counter negative >> + * events, if needed. >> + */ >> + if (mmcr0_counter_neg_cond_enabled(env->spr[SPR_POWER_MMCR0])) { >> + set_PMU_excp_timer(env); >> + } >> } >> } >> - >> - env->spr[SPR_POWER_MMCR0] = value; >> } >
On Tue, Aug 10, 2021 at 05:26:09PM -0300, Daniel Henrique Barboza wrote: > > > On 8/10/21 1:01 AM, David Gibson wrote: > > On Mon, Aug 09, 2021 at 10:10:50AM -0300, Daniel Henrique Barboza wrote: > > > This patch starts the counter negative EBB support by enabling PMC1 > > > counter negative condition. > > > > > > A counter negative condition happens when a performance monitor counter > > > reaches the value 0x80000000. When that happens, if a counter negative > > > condition is enabled in that counter, a performance monitor alert is > > > triggered. For PMC1, this condition is enabled by MMCR0_PMC1CE. > > > > > > An icount-based logic is used to predict when we need to wake up the timer > > > to trigger the alert in both PM_INST_CMPL (0x2) and PM_CYC (0x1E) events. > > > The timer callback will then trigger a PPC_INTERRUPT_PMC which will become a > > > event-based exception later. > > > > > > Some EBB powerpc kernel selftests are passing after this patch, but a > > > substancial amount of them relies on other PMCs to be enabled and events > > > that we don't support at this moment. We'll address that in the next > > > patches. 
> > > > > > Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com> > > > --- > > > target/ppc/cpu.h | 1 + > > > target/ppc/pmu_book3s_helper.c | 127 +++++++++++++++++++++++---------- > > > 2 files changed, 92 insertions(+), 36 deletions(-) > > > > > > diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h > > > index 1d38b8cf7a..5c81d459f4 100644 > > > --- a/target/ppc/cpu.h > > > +++ b/target/ppc/cpu.h > > > @@ -350,6 +350,7 @@ typedef struct ppc_v3_pate_t { > > > #define MMCR0_EBE PPC_BIT(43) /* Perf Monitor EBB Enable */ > > > #define MMCR0_FCECE PPC_BIT(38) /* FC on Enabled Cond or Event */ > > > #define MMCR0_PMCC PPC_BITMASK(44, 45) /* PMC Control */ > > > +#define MMCR0_PMC1CE PPC_BIT(48) > > > #define MMCR1_PMC1SEL_SHIFT (63 - 39) > > > #define MMCR1_PMC1SEL PPC_BITMASK(32, 39) > > > diff --git a/target/ppc/pmu_book3s_helper.c b/target/ppc/pmu_book3s_helper.c > > > index 43cc0eb722..58ae65e22b 100644 > > > --- a/target/ppc/pmu_book3s_helper.c > > > +++ b/target/ppc/pmu_book3s_helper.c > > > @@ -25,6 +25,7 @@ > > > * and SPAPR code. 
> > > */ > > > #define PPC_CPU_FREQ 1000000000 > > > +#define COUNTER_NEGATIVE_VAL 0x80000000 > > > static uint64_t get_cycles(uint64_t icount_delta) > > > { > > > @@ -32,22 +33,9 @@ static uint64_t get_cycles(uint64_t icount_delta) > > > NANOSECONDS_PER_SECOND); > > > } > > > -static void update_PMC_PM_INST_CMPL(CPUPPCState *env, int sprn, > > > - uint64_t icount_delta) > > > -{ > > > - env->spr[sprn] += icount_delta; > > > -} > > > - > > > -static void update_PMC_PM_CYC(CPUPPCState *env, int sprn, > > > - uint64_t icount_delta) > > > -{ > > > - env->spr[sprn] += get_cycles(icount_delta); > > > -} > > > - > > > -static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, > > > - uint64_t icount_delta) > > > +static uint8_t get_PMC_event(CPUPPCState *env, int sprn) > > > { > > > - int event; > > > + int event = 0x0; > > > switch (sprn) { > > > case SPR_POWER_PMC1: > > > @@ -65,11 +53,35 @@ static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, > > > case SPR_POWER_PMC4: > > > event = MMCR1_PMC4SEL & env->spr[SPR_POWER_MMCR1]; > > > break; > > > + case SPR_POWER_PMC5: > > > + event = 0x2; > > > + break; > > > + case SPR_POWER_PMC6: > > > + event = 0x1E; > > > + break; > > > > This looks like a nice cleanup that would be better folded into an > > earlier patch. 
> > > > > default: > > > - return; > > > + break; > > > } > > > - switch (event) { > > > + return event; > > > +} > > > + > > > +static void update_PMC_PM_INST_CMPL(CPUPPCState *env, int sprn, > > > + uint64_t icount_delta) > > > +{ > > > + env->spr[sprn] += icount_delta; > > > +} > > > + > > > +static void update_PMC_PM_CYC(CPUPPCState *env, int sprn, > > > + uint64_t icount_delta) > > > +{ > > > + env->spr[sprn] += get_cycles(icount_delta); > > > +} > > > + > > > +static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, > > > + uint64_t icount_delta) > > > +{ > > > + switch (get_PMC_event(env, sprn)) { > > > case 0x2: > > > update_PMC_PM_INST_CMPL(env, sprn, icount_delta); > > > break; > > > @@ -99,30 +111,57 @@ static void update_PMCs(CPUPPCState *env, uint64_t icount_delta) > > > update_PMC_PM_CYC(env, SPR_POWER_PMC6, icount_delta); > > > } > > > +static void set_PMU_excp_timer(CPUPPCState *env) > > > +{ > > > + uint64_t timeout, now, remaining_val; > > > + > > > + if (!(env->spr[SPR_POWER_MMCR0] & MMCR0_PMC1CE)) { > > > + return; > > > + } > > > + > > > + remaining_val = COUNTER_NEGATIVE_VAL - env->spr[SPR_POWER_PMC1]; > > > + > > > + switch (get_PMC_event(env, SPR_POWER_PMC1)) { > > > + case 0x2: > > > + timeout = icount_to_ns(remaining_val); > > > + break; > > > + case 0x1e: > > > + timeout = muldiv64(remaining_val, NANOSECONDS_PER_SECOND, > > > + PPC_CPU_FREQ); > > > > So.. this appears to be simulating to the guest that cycles are > > occurring at a constant rate, consistent with the advertised CPU > > frequency. Which sounds right, except... it's not clear to me that > > you're using the same logic to generate the values you read from the > > cycles PMC (in that case it shouldn't need to reference icount at all, > > right?). > > 'remaining_val' meaning depends on the event being sampled in the PMC > in that moment. PMCs 1 to 4 can have a multitude of events, PMC5 is always > count instructions and PMC6 is always counting cycles. 
> > For 0x02, env->spr[SPR_POWER_PMC1] contains instructions. remaining_val is > the remaining insns for the counter negative condition, and icount_to_ns() > is the timeout estimation for that. The value of the PMC1 will be set > via update_PMC_PM_INST_CMPL(), which in turn is just a matter of summing > the elapsed icount delta between start and freeze into the PMC. > > For 0x1e, env->spr[SPR_POWER_PMC1] contains cycles. remaining_val is > the remaining cycles for counter negative cycles, and this muldiv64 calc > is the timeout estimation in this case. The PMC value is set via > update_PMC_PM_CYC(), which in turn does a math with the current icount > delta in get_cycles(icount_delta) to get the current PMC value. > > All the metrics implemented in this PMU relies on 'icount_delta', the > amount of icount units between the change states of MMCR0_FC (and other > freeze counter bits like patch 17). Ah, sorry, I missed that the PMC value (and therefore remaining val) was based on the icount delta. So.. that's consistent, but what is the justification for using something based on icount for cycles, rather than something based on time?
On 8/11/21 12:40 AM, David Gibson wrote: > On Tue, Aug 10, 2021 at 05:26:09PM -0300, Daniel Henrique Barboza wrote: >> >> >> On 8/10/21 1:01 AM, David Gibson wrote: >>> On Mon, Aug 09, 2021 at 10:10:50AM -0300, Daniel Henrique Barboza wrote: >>>> This patch starts the counter negative EBB support by enabling PMC1 >>>> counter negative condition. >>>> >>>> A counter negative condition happens when a performance monitor counter >>>> reaches the value 0x80000000. When that happens, if a counter negative >>>> condition is enabled in that counter, a performance monitor alert is >>>> triggered. For PMC1, this condition is enabled by MMCR0_PMC1CE. >>>> >>>> An icount-based logic is used to predict when we need to wake up the timer >>>> to trigger the alert in both PM_INST_CMPL (0x2) and PM_CYC (0x1E) events. >>>> The timer callback will then trigger a PPC_INTERRUPT_PMC which will become a >>>> event-based exception later. >>>> >>>> Some EBB powerpc kernel selftests are passing after this patch, but a >>>> substancial amount of them relies on other PMCs to be enabled and events >>>> that we don't support at this moment. We'll address that in the next >>>> patches. 
>>>> >>>> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com> >>>> --- >>>> target/ppc/cpu.h | 1 + >>>> target/ppc/pmu_book3s_helper.c | 127 +++++++++++++++++++++++---------- >>>> 2 files changed, 92 insertions(+), 36 deletions(-) >>>> >>>> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h >>>> index 1d38b8cf7a..5c81d459f4 100644 >>>> --- a/target/ppc/cpu.h >>>> +++ b/target/ppc/cpu.h >>>> @@ -350,6 +350,7 @@ typedef struct ppc_v3_pate_t { >>>> #define MMCR0_EBE PPC_BIT(43) /* Perf Monitor EBB Enable */ >>>> #define MMCR0_FCECE PPC_BIT(38) /* FC on Enabled Cond or Event */ >>>> #define MMCR0_PMCC PPC_BITMASK(44, 45) /* PMC Control */ >>>> +#define MMCR0_PMC1CE PPC_BIT(48) >>>> #define MMCR1_PMC1SEL_SHIFT (63 - 39) >>>> #define MMCR1_PMC1SEL PPC_BITMASK(32, 39) >>>> diff --git a/target/ppc/pmu_book3s_helper.c b/target/ppc/pmu_book3s_helper.c >>>> index 43cc0eb722..58ae65e22b 100644 >>>> --- a/target/ppc/pmu_book3s_helper.c >>>> +++ b/target/ppc/pmu_book3s_helper.c >>>> @@ -25,6 +25,7 @@ >>>> * and SPAPR code. 
>>>> */ >>>> #define PPC_CPU_FREQ 1000000000 >>>> +#define COUNTER_NEGATIVE_VAL 0x80000000 >>>> static uint64_t get_cycles(uint64_t icount_delta) >>>> { >>>> @@ -32,22 +33,9 @@ static uint64_t get_cycles(uint64_t icount_delta) >>>> NANOSECONDS_PER_SECOND); >>>> } >>>> -static void update_PMC_PM_INST_CMPL(CPUPPCState *env, int sprn, >>>> - uint64_t icount_delta) >>>> -{ >>>> - env->spr[sprn] += icount_delta; >>>> -} >>>> - >>>> -static void update_PMC_PM_CYC(CPUPPCState *env, int sprn, >>>> - uint64_t icount_delta) >>>> -{ >>>> - env->spr[sprn] += get_cycles(icount_delta); >>>> -} >>>> - >>>> -static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, >>>> - uint64_t icount_delta) >>>> +static uint8_t get_PMC_event(CPUPPCState *env, int sprn) >>>> { >>>> - int event; >>>> + int event = 0x0; >>>> switch (sprn) { >>>> case SPR_POWER_PMC1: >>>> @@ -65,11 +53,35 @@ static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, >>>> case SPR_POWER_PMC4: >>>> event = MMCR1_PMC4SEL & env->spr[SPR_POWER_MMCR1]; >>>> break; >>>> + case SPR_POWER_PMC5: >>>> + event = 0x2; >>>> + break; >>>> + case SPR_POWER_PMC6: >>>> + event = 0x1E; >>>> + break; >>> >>> This looks like a nice cleanup that would be better folded into an >>> earlier patch. 
>>> >>>> default: >>>> - return; >>>> + break; >>>> } >>>> - switch (event) { >>>> + return event; >>>> +} >>>> + >>>> +static void update_PMC_PM_INST_CMPL(CPUPPCState *env, int sprn, >>>> + uint64_t icount_delta) >>>> +{ >>>> + env->spr[sprn] += icount_delta; >>>> +} >>>> + >>>> +static void update_PMC_PM_CYC(CPUPPCState *env, int sprn, >>>> + uint64_t icount_delta) >>>> +{ >>>> + env->spr[sprn] += get_cycles(icount_delta); >>>> +} >>>> + >>>> +static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, >>>> + uint64_t icount_delta) >>>> +{ >>>> + switch (get_PMC_event(env, sprn)) { >>>> case 0x2: >>>> update_PMC_PM_INST_CMPL(env, sprn, icount_delta); >>>> break; >>>> @@ -99,30 +111,57 @@ static void update_PMCs(CPUPPCState *env, uint64_t icount_delta) >>>> update_PMC_PM_CYC(env, SPR_POWER_PMC6, icount_delta); >>>> } >>>> +static void set_PMU_excp_timer(CPUPPCState *env) >>>> +{ >>>> + uint64_t timeout, now, remaining_val; >>>> + >>>> + if (!(env->spr[SPR_POWER_MMCR0] & MMCR0_PMC1CE)) { >>>> + return; >>>> + } >>>> + >>>> + remaining_val = COUNTER_NEGATIVE_VAL - env->spr[SPR_POWER_PMC1]; >>>> + >>>> + switch (get_PMC_event(env, SPR_POWER_PMC1)) { >>>> + case 0x2: >>>> + timeout = icount_to_ns(remaining_val); >>>> + break; >>>> + case 0x1e: >>>> + timeout = muldiv64(remaining_val, NANOSECONDS_PER_SECOND, >>>> + PPC_CPU_FREQ); >>> >>> So.. this appears to be simulating to the guest that cycles are >>> occurring at a constant rate, consistent with the advertised CPU >>> frequency. Which sounds right, except... it's not clear to me that >>> you're using the same logic to generate the values you read from the >>> cycles PMC (in that case it shouldn't need to reference icount at all, >>> right?). >> >> 'remaining_val' meaning depends on the event being sampled in the PMC >> in that moment. PMCs 1 to 4 can have a multitude of events, PMC5 is always >> count instructions and PMC6 is always counting cycles. 
>> >> For 0x02, env->spr[SPR_POWER_PMC1] contains instructions. remaining_val is >> the remaining insns for the counter negative condition, and icount_to_ns() >> is the timeout estimation for that. The value of the PMC1 will be set >> via update_PMC_PM_INST_CMPL(), which in turn is just a matter of summing >> the elapsed icount delta between start and freeze into the PMC. >> >> For 0x1e, env->spr[SPR_POWER_PMC1] contains cycles. remaining_val is >> the remaining cycles for counter negative cycles, and this muldiv64 calc >> is the timeout estimation in this case. The PMC value is set via >> update_PMC_PM_CYC(), which in turn does a math with the current icount >> delta in get_cycles(icount_delta) to get the current PMC value. >> >> All the metrics implemented in this PMU relies on 'icount_delta', the >> amount of icount units between the change states of MMCR0_FC (and other >> freeze counter bits like patch 17). > > Ah, sorry, I missed that the PMC value (and therefore remaining val) > was based on the icount delta. > > So.. that's consistent, but what is the justification for using > something based on icount for cycles, rather than something based on time? We could calculate the cycles via time granted that the clock we're using is fixed in 1Ghz, so 1ns => 1 cycle. For that, we would need a similar mechanic to what we already do with icount - get a time_base, cycles would be calculated via a time_delta, etc. However, and commenting a bit on Richard's review in patch 08, the cycle calculation we're doing is just returning icount_to_ns(icount_delta) because PPC_CPU_FREQ and NANOSECONDS_PER_SECOND are the same value. So, given that the conditions in which we would need to store and calculate a time delta is the same as what we're already doing with icount today, isn't cycles = icount_to_ns(icount_delta) = time_delta? I mean, nothing is stopping us from calculating cycles using time, but in the end we would do the same thing we're already doing today. Thanks, Daniel >
On Wed, Aug 11, 2021 at 08:18:29AM -0300, Daniel Henrique Barboza wrote: > > > On 8/11/21 12:40 AM, David Gibson wrote: > > On Tue, Aug 10, 2021 at 05:26:09PM -0300, Daniel Henrique Barboza wrote: > > > > > > > > > On 8/10/21 1:01 AM, David Gibson wrote: > > > > On Mon, Aug 09, 2021 at 10:10:50AM -0300, Daniel Henrique Barboza wrote: > > > > > This patch starts the counter negative EBB support by enabling PMC1 > > > > > counter negative condition. > > > > > > > > > > A counter negative condition happens when a performance monitor counter > > > > > reaches the value 0x80000000. When that happens, if a counter negative > > > > > condition is enabled in that counter, a performance monitor alert is > > > > > triggered. For PMC1, this condition is enabled by MMCR0_PMC1CE. > > > > > > > > > > An icount-based logic is used to predict when we need to wake up the timer > > > > > to trigger the alert in both PM_INST_CMPL (0x2) and PM_CYC (0x1E) events. > > > > > The timer callback will then trigger a PPC_INTERRUPT_PMC which will become a > > > > > event-based exception later. > > > > > > > > > > Some EBB powerpc kernel selftests are passing after this patch, but a > > > > > substancial amount of them relies on other PMCs to be enabled and events > > > > > that we don't support at this moment. We'll address that in the next > > > > > patches. 
> > > > > > > > > > Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com> > > > > > --- > > > > > target/ppc/cpu.h | 1 + > > > > > target/ppc/pmu_book3s_helper.c | 127 +++++++++++++++++++++++---------- > > > > > 2 files changed, 92 insertions(+), 36 deletions(-) > > > > > > > > > > diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h > > > > > index 1d38b8cf7a..5c81d459f4 100644 > > > > > --- a/target/ppc/cpu.h > > > > > +++ b/target/ppc/cpu.h > > > > > @@ -350,6 +350,7 @@ typedef struct ppc_v3_pate_t { > > > > > #define MMCR0_EBE PPC_BIT(43) /* Perf Monitor EBB Enable */ > > > > > #define MMCR0_FCECE PPC_BIT(38) /* FC on Enabled Cond or Event */ > > > > > #define MMCR0_PMCC PPC_BITMASK(44, 45) /* PMC Control */ > > > > > +#define MMCR0_PMC1CE PPC_BIT(48) > > > > > #define MMCR1_PMC1SEL_SHIFT (63 - 39) > > > > > #define MMCR1_PMC1SEL PPC_BITMASK(32, 39) > > > > > diff --git a/target/ppc/pmu_book3s_helper.c b/target/ppc/pmu_book3s_helper.c > > > > > index 43cc0eb722..58ae65e22b 100644 > > > > > --- a/target/ppc/pmu_book3s_helper.c > > > > > +++ b/target/ppc/pmu_book3s_helper.c > > > > > @@ -25,6 +25,7 @@ > > > > > * and SPAPR code. 
> > > > > */ > > > > > #define PPC_CPU_FREQ 1000000000 > > > > > +#define COUNTER_NEGATIVE_VAL 0x80000000 > > > > > static uint64_t get_cycles(uint64_t icount_delta) > > > > > { > > > > > @@ -32,22 +33,9 @@ static uint64_t get_cycles(uint64_t icount_delta) > > > > > NANOSECONDS_PER_SECOND); > > > > > } > > > > > -static void update_PMC_PM_INST_CMPL(CPUPPCState *env, int sprn, > > > > > - uint64_t icount_delta) > > > > > -{ > > > > > - env->spr[sprn] += icount_delta; > > > > > -} > > > > > - > > > > > -static void update_PMC_PM_CYC(CPUPPCState *env, int sprn, > > > > > - uint64_t icount_delta) > > > > > -{ > > > > > - env->spr[sprn] += get_cycles(icount_delta); > > > > > -} > > > > > - > > > > > -static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, > > > > > - uint64_t icount_delta) > > > > > +static uint8_t get_PMC_event(CPUPPCState *env, int sprn) > > > > > { > > > > > - int event; > > > > > + int event = 0x0; > > > > > switch (sprn) { > > > > > case SPR_POWER_PMC1: > > > > > @@ -65,11 +53,35 @@ static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, > > > > > case SPR_POWER_PMC4: > > > > > event = MMCR1_PMC4SEL & env->spr[SPR_POWER_MMCR1]; > > > > > break; > > > > > + case SPR_POWER_PMC5: > > > > > + event = 0x2; > > > > > + break; > > > > > + case SPR_POWER_PMC6: > > > > > + event = 0x1E; > > > > > + break; > > > > > > > > This looks like a nice cleanup that would be better folded into an > > > > earlier patch. 
> > > > > > > > > default: > > > > > - return; > > > > > + break; > > > > > } > > > > > - switch (event) { > > > > > + return event; > > > > > +} > > > > > + > > > > > +static void update_PMC_PM_INST_CMPL(CPUPPCState *env, int sprn, > > > > > + uint64_t icount_delta) > > > > > +{ > > > > > + env->spr[sprn] += icount_delta; > > > > > +} > > > > > + > > > > > +static void update_PMC_PM_CYC(CPUPPCState *env, int sprn, > > > > > + uint64_t icount_delta) > > > > > +{ > > > > > + env->spr[sprn] += get_cycles(icount_delta); > > > > > +} > > > > > + > > > > > +static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, > > > > > + uint64_t icount_delta) > > > > > +{ > > > > > + switch (get_PMC_event(env, sprn)) { > > > > > case 0x2: > > > > > update_PMC_PM_INST_CMPL(env, sprn, icount_delta); > > > > > break; > > > > > @@ -99,30 +111,57 @@ static void update_PMCs(CPUPPCState *env, uint64_t icount_delta) > > > > > update_PMC_PM_CYC(env, SPR_POWER_PMC6, icount_delta); > > > > > } > > > > > +static void set_PMU_excp_timer(CPUPPCState *env) > > > > > +{ > > > > > + uint64_t timeout, now, remaining_val; > > > > > + > > > > > + if (!(env->spr[SPR_POWER_MMCR0] & MMCR0_PMC1CE)) { > > > > > + return; > > > > > + } > > > > > + > > > > > + remaining_val = COUNTER_NEGATIVE_VAL - env->spr[SPR_POWER_PMC1]; > > > > > + > > > > > + switch (get_PMC_event(env, SPR_POWER_PMC1)) { > > > > > + case 0x2: > > > > > + timeout = icount_to_ns(remaining_val); > > > > > + break; > > > > > + case 0x1e: > > > > > + timeout = muldiv64(remaining_val, NANOSECONDS_PER_SECOND, > > > > > + PPC_CPU_FREQ); > > > > > > > > So.. this appears to be simulating to the guest that cycles are > > > > occurring at a constant rate, consistent with the advertised CPU > > > > frequency. Which sounds right, except... it's not clear to me that > > > > you're using the same logic to generate the values you read from the > > > > cycles PMC (in that case it shouldn't need to reference icount at all, > > > > right?). 
> > > > > > 'remaining_val' meaning depends on the event being sampled in the PMC > > > in that moment. PMCs 1 to 4 can have a multitude of events, PMC5 is always > > > count instructions and PMC6 is always counting cycles. > > > > > > For 0x02, env->spr[SPR_POWER_PMC1] contains instructions. remaining_val is > > > the remaining insns for the counter negative condition, and icount_to_ns() > > > is the timeout estimation for that. The value of the PMC1 will be set > > > via update_PMC_PM_INST_CMPL(), which in turn is just a matter of summing > > > the elapsed icount delta between start and freeze into the PMC. > > > > > > For 0x1e, env->spr[SPR_POWER_PMC1] contains cycles. remaining_val is > > > the remaining cycles for counter negative cycles, and this muldiv64 calc > > > is the timeout estimation in this case. The PMC value is set via > > > update_PMC_PM_CYC(), which in turn does a math with the current icount > > > delta in get_cycles(icount_delta) to get the current PMC value. > > > > > > All the metrics implemented in this PMU relies on 'icount_delta', the > > > amount of icount units between the change states of MMCR0_FC (and other > > > freeze counter bits like patch 17). > > > > Ah, sorry, I missed that the PMC value (and therefore remaining val) > > was based on the icount delta. > > > > So.. that's consistent, but what is the justification for using > > something based on icount for cycles, rather than something based on time? > > > We could calculate the cycles via time granted that the clock we're using > is fixed in 1Ghz, so 1ns => 1 cycle. For that, we would need a similar mechanic > to what we already do with icount - get a time_base, cycles would be calculated > via a time_delta, etc. > > However, and commenting a bit on Richard's review in patch 08, the cycle > calculation we're doing is just returning icount_to_ns(icount_delta) because > PPC_CPU_FREQ and NANOSECONDS_PER_SECOND are the same value. 
So, given that the > conditions in which we would need to store and calculate a time delta are the > same as what we're already doing with icount today, isn't > cycles = icount_to_ns(icount_delta) = time_delta? > > I mean, nothing is stopping us from calculating cycles using time, but in the > end we would do the same thing we're already doing today. Oh.. ok. I had assumed that icount worked by instrumenting the generated TCG code to actually count instructions, rather than working off the time.
On 8/11/21 5:39 PM, David Gibson wrote: >> I mean, nothing is stopping us from calculating cycles using time, but in the >> end we would do the same thing we're already doing today. > > Oh.. ok. I had assumed that icount worked by instrumenting the > generate TCG code to actually count instructions, rather than working > off the time. David, you're right, icount instruments the generated tcg code. You also have to add -icount to the command-line. r~
On 8/11/21 6:45 PM, Richard Henderson wrote: > On 8/11/21 5:39 PM, David Gibson wrote: >>> I mean, nothing is stopping us from calculating cycles using time, but in the >>> end we would do the same thing we're already doing today. >> >> Oh.. ok. I had assumed that icount worked by instrumenting the >> generate TCG code to actually count instructions, rather than working >> off the time. > > David, you're right, icount instruments the generated tcg code. > You also have to add -icount to the command-line. Oh, and btw, icount disables multi-threaded tcg, so you're going to be running that guest in round-robin mode. Icount affects so many aspects of qemu that I really do not think it is the best option for a PMU. If you want to count instructions retired, then just do that. Stuff values into tcg_gen_insn_start so they're available for exception unwind, and otherwise decrement the counter at the end of a TB. If you really must interrupt exactly at 0 (and not simply at some point past underflow), then we can adjust the tb lookup logic to let you reduce tb->cflags.CF_COUNT_MASK in cpu_get_tb_cpu_state. r~
On 8/12/21 1:56 AM, Richard Henderson wrote: > On 8/11/21 6:45 PM, Richard Henderson wrote: >> On 8/11/21 5:39 PM, David Gibson wrote: >>>> I mean, nothing is stopping us from calculating cycles using time, but in the >>>> end we would do the same thing we're already doing today. >>> >>> Oh.. ok. I had assumed that icount worked by instrumenting the >>> generate TCG code to actually count instructions, rather than working >>> off the time. >> >> David, you're right, icount instruments the generated tcg code. >> You also have to add -icount to the command-line. > > Oh, and btw, icount disables multi-threaded tcg, so you're going to be running that guest in round-robin mode. > > Icount affects so many aspects of qemu that I really do not think it is the best option for a PMU. Using icount in the PMU is my fallback plan. I spent some time trying to count instructions using translationOps but never got it right. I got up to a point where the tests were counting instructions for the first time it was run in the guest, then nothing else counted in consecutive runs. I was able to figure out that it had to do with how the translation block works. If a previously ran TB was found via lookup then the translationOps callbacks I was using weren't being called. I know that I was missing a piece of info there, but since I'm trying to deal with other aspects of the PMU logic I fell back into using icount to get things of the ground. All this said .... > > If you want to count instructions retired, then just do that. Stuff values into tcg_gen_insn_start so they're available for exception unwind, and otherwise decrement the counter at the end of a TB. ... I don't bother giving this a try. Can you clarify what do you mean with "exception unwind"? I tried something similar with tcg_gen_insn_start() (called via ppc_tr_insn_start()). 
Since this op is called inside translator_loop(), and translator_loop() isn't run if TB_lookup returns the TB, I was observing the behavior I described above of a test counting instructions in its first run only. > > If you really must interrupt exactly at 0 (and not simply at some point past underflow), then we can adjust the tb lookup logic to let you reduce tb->cflags.CF_COUNT_MASK in cpu_get_tb_cpu_state. That would be good, but depending on the amount of work I would consider doing this as a follow-up of this work. It's ok if the PMU overshoots instructions a bit for our current purposes ATM. Thanks, Daniel > > > r~
On 8/12/21 7:17 AM, Daniel Henrique Barboza wrote: > > > On 8/12/21 1:56 AM, Richard Henderson wrote: >> On 8/11/21 6:45 PM, Richard Henderson wrote: >>> On 8/11/21 5:39 PM, David Gibson wrote: >>>>> I mean, nothing is stopping us from calculating cycles using time, but in the >>>>> end we would do the same thing we're already doing today. >>>> >>>> Oh.. ok. I had assumed that icount worked by instrumenting the >>>> generate TCG code to actually count instructions, rather than working >>>> off the time. >>> >>> David, you're right, icount instruments the generated tcg code. >>> You also have to add -icount to the command-line. >> >> Oh, and btw, icount disables multi-threaded tcg, so you're going to be running that guest in round-robin mode. >> >> Icount affects so many aspects of qemu that I really do not think it is the best option for a PMU. > > Using icount in the PMU is my fallback plan. I spent some time trying to > count instructions using translationOps but never got it right. I got > up to a point where the tests were counting instructions for the first > time it was run in the guest, then nothing else counted in consecutive > runs. > > I was able to figure out that it had to do with how the translation block > works. If a previously ran TB was found via lookup then the translationOps > callbacks I was using weren't being called. I know that I was missing a > piece of info there, but since I'm trying to deal with other aspects of the > PMU logic I fell back into using icount to get things of the ground. 
So, I made an attempt to remove all icount() references from the PMU code and instead did the following (posting just the relevant diff): + +void helper_insns_inc(CPUPPCState *env) +{ + env->pmu_insns_count++; +} + +void helper_insns_dec(CPUPPCState *env) +{ + env->pmu_insns_count--; +} diff --git a/target/ppc/translate.c b/target/ppc/translate.c index 60f35360b7..c56c656694 100644 --- a/target/ppc/translate.c +++ b/target/ppc/translate.c @@ -8689,6 +8689,7 @@ static void ppc_tr_tb_start(DisasContextBase *db, CPUState *cs) static void ppc_tr_insn_start(DisasContextBase *dcbase, CPUState *cs) { + gen_helper_insns_inc(cpu_env); tcg_gen_insn_start(dcbase->pc_next); } @@ -8755,6 +8756,8 @@ static void ppc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs) return; } + gen_helper_insns_dec(cpu_env); + /* Honor single stepping. */ sse = ctx->singlestep_enabled & (CPU_SINGLE_STEP | GDBSTUB_SINGLE_STEP); if (unlikely(sse)) { And then I used 'env->pmu_insns_count' to update the instruction counters. And it's working, with a slightly better performance than we have with the icount() version. I'm not sure why it's working now since I tried something very similar before and it didn't. Figures. It's still overshooting the instructions a bit, but then there's the optimization you mentioned previously with the tb lookup logic that could be done to improve that as well. I'm not sure if this is in line or close with what you proposed, but it's already better than the icount version that I posted here. Thanks, Daniel > > All this said .... > >> >> If you want to count instructions retired, then just do that. Stuff values into tcg_gen_insn_start so they're available for exception unwind, and otherwise decrement the counter at the end of a TB. > > ... I don't bother giving this a try. Can you clarify what do you mean > with "exception unwind"? > > I tried something similar with tcg_gen_insn_start() (called via ppc_tr_insn_start()). 
> This this ops is called inside translator_loop(), and translator_loop() isn't > being ran if TB_lookup returns the TB, I was observing the behavior I > described above of a test counting instructions in its first run only. > > >> >> If you really must interrupt exactly at 0 (and not simply at some point past underflow), then we can adjust the tb lookup logic to let you reduce tb->cflags.CF_COUNT_MASK in cpu_get_tb_cpu_state. > > That would be good, but depending on the amount of work I would consider doing > this is a follow-up of this work. It's ok if the PMU overflows a bit instructions > for our current purposes ATM. > > > Thanks, > > > Daniel > >> >> >> r~
On 8/12/21 11:24 AM, Daniel Henrique Barboza wrote: > +void helper_insns_inc(CPUPPCState *env) > +{ > + env->pmu_insns_count++; > +} > + > +void helper_insns_dec(CPUPPCState *env) > +{ > + env->pmu_insns_count--; > +} > diff --git a/target/ppc/translate.c b/target/ppc/translate.c > index 60f35360b7..c56c656694 100644 > --- a/target/ppc/translate.c > +++ b/target/ppc/translate.c > @@ -8689,6 +8689,7 @@ static void ppc_tr_tb_start(DisasContextBase *db, CPUState *cs) > > static void ppc_tr_insn_start(DisasContextBase *dcbase, CPUState *cs) > { > + gen_helper_insns_inc(cpu_env); > tcg_gen_insn_start(dcbase->pc_next); > } > > @@ -8755,6 +8756,8 @@ static void ppc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs) > return; > } > > + gen_helper_insns_dec(cpu_env); > + > /* Honor single stepping. */ > sse = ctx->singlestep_enabled & (CPU_SINGLE_STEP | GDBSTUB_SINGLE_STEP); > if (unlikely(sse)) { > > > And then I used 'env->pmu_insns_count' to update the instruction counters. And it's > working, with a slightly better performance than we have with the icount() > version. I'm not sure why it's working now since I tried something very similar > before and it didn't. Figures. Not sure why you're decrementing there. Also, how do you want to count retired instructions? Ones that merely start? E.g. including loads that fault? How about illegal instructions? Or does the instruction need to run to completion? Which of these you choose affects where you place the increment. It is of course extremely heavyweight to call a helper to perform a simple addition operation. You may also wish to look at target/hexagon, gen_exec_counters(), which is doing the same sort of thing. But not completely, since it loses count along dynamic exception paths (e.g. faulting load). That can be fixed as well, via restore_state_to_opc. r~
On 8/12/21 9:35 PM, Richard Henderson wrote: > On 8/12/21 11:24 AM, Daniel Henrique Barboza wrote: >> +void helper_insns_inc(CPUPPCState *env) >> +{ >> + env->pmu_insns_count++; >> +} >> + >> +void helper_insns_dec(CPUPPCState *env) >> +{ >> + env->pmu_insns_count--; >> +} >> diff --git a/target/ppc/translate.c b/target/ppc/translate.c >> index 60f35360b7..c56c656694 100644 >> --- a/target/ppc/translate.c >> +++ b/target/ppc/translate.c >> @@ -8689,6 +8689,7 @@ static void ppc_tr_tb_start(DisasContextBase *db, CPUState *cs) >> >> static void ppc_tr_insn_start(DisasContextBase *dcbase, CPUState *cs) >> { >> + gen_helper_insns_inc(cpu_env); >> tcg_gen_insn_start(dcbase->pc_next); >> } >> >> @@ -8755,6 +8756,8 @@ static void ppc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs) >> return; >> } >> >> + gen_helper_insns_dec(cpu_env); >> + >> /* Honor single stepping. */ >> sse = ctx->singlestep_enabled & (CPU_SINGLE_STEP | GDBSTUB_SINGLE_STEP); >> if (unlikely(sse)) { >> >> >> And then I used 'env->pmu_insns_count' to update the instruction counters. And it's >> working, with a slightly better performance than we have with the icount() >> version. I'm not sure why it's working now since I tried something very similar >> before and it didn't. Figures. > > Not sure why you're decrementing there. > > Also, how do you want to count retired instructions? Ones that merely start? E.g. including loads that fault? How about illegal instructions? Or does the instruction need to run to completion? Which of these you choose affects where you place the increment. Before I proceed, let me mention that the numbers I'm about to post here are from the powerpc selftest located here: https://github.com/torvalds/linux/blob/master/tools/testing/selftests/powerpc/pmu/count_instructions.c This test runs an instruction loop that consists of 'addi' instructions . Before running the instructions there's an overhead calculation with an empty loop. 
So, the event I want to count is described as "The thread has completed an instruction" in the ISA. There are no events that counts illegal instructions or that causes fault, so my guess is that all completed functions regardless of side-effects are counted. Putting the incrementer in ppc_tr_insn_start() like I mentioned in my previous reply provides results similar to the icount version: # ./count_instructions test: count_instructions tags: git_version:v5.13-5357-gdbe69e433722-dirty Binding to cpu 0 main test running as pid 568 Overhead of null loop: 2370 instructions instructions: result 1002370 running/enabled 1587314 cycles: result 1001372 running/enabled 1348532 Looped for 1000000 instructions, overhead 2370 Expected 1002370 Actual 1002370 Delta 0, 0.000000% instructions: result 10010309 running/enabled 11603340 cycles: result 10009311 running/enabled 11362216 Looped for 10000000 instructions, overhead 2370 Expected 10002370 Actual 10010309 Delta 7939, 0.079308% [FAIL] Test FAILED on line 119 failure: count_instructions I'm aware that putting the increment in insn_start() is too early since the instruction didn't even complete, so I attempted to put the increment at the end of ppc_tr_translate_insn(). 
I ended up with fewer instructions counted for the same test: # ./count_instructions test: count_instructions tags: git_version:v5.13-5357-gdbe69e433722-dirty Binding to cpu 0 main test running as pid 575 Overhead of null loop: 1962 instructions instructions: result 939462 running/enabled 1587310 cycles: result 1001372 running/enabled 1348532 Looped for 1000000 instructions, overhead 1962 Expected 1001962 Actual 939462 Delta -62500, -6.652744% [FAIL] Test FAILED on line 116 failure: count_instructions Reading translator_loop() I can't explain why we're missing this amount of functions because translate_insn() is always called right after insns_start(): while (true) { db->num_insns++; /* NOTE: incrementing the counter in insn_start() ppc impl gives similar result as icount*/ ops->insn_start(db, cpu); tcg_debug_assert(db->is_jmp == DISAS_NEXT); /* no early exit */ (...) /* NOTE: incrementing the counter in translate_insn() PPC impl misses 6% of the instructions, even though it's always called right after insn_start() */ if (db->num_insns == db->max_insns && (cflags & CF_LAST_IO)) { /* Accept I/O on the last instruction. */ gen_io_start(); ops->translate_insn(db, cpu); } else { /* we should only see CF_MEMI_ONLY for io_recompile */ tcg_debug_assert(!(cflags & CF_MEMI_ONLY)); ops->translate_insn(db, cpu); } I've recompiled QEMU with --enable-tcg-debug to see if tcg_debug_assert() right after insn_start() was being hit. The assert isn't being hit in the test. I also tried to read the instructions in tb_stop() via db->num_insns. Since I'm not worried about missing the counter_overflow a few instructions it would also be an alternative. 
I changed the helper to receive a number of instructions to be added instead of single increments: static void ppc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs) target_ulong nip = ctx->base.pc_next; int sse; + gen_helper_insns_inc(cpu_env, tcg_constant_i32(dcbase->num_insns)); + if (is_jmp == DISAS_NORETURN) { /* We have already exited the TB. */ return; The test gives us these numbers: # ./count_instructions test: count_instructions tags: git_version:v5.13-5357-gdbe69e433722-dirty Binding to cpu 0 main test running as pid 573 Overhead of null loop: 85 instructions instructions: result 85 running/enabled 1587316 cycles: result 1001372 running/enabled 1348534 Looped for 1000000 instructions, overhead 85 Expected 1000085 Actual 85 Delta -1000000, -1176470.588235% [FAIL] Test FAILED on line 116 failure: count_instructions This is the situation I've faced some time ago and commented on a previous email. My hypothesis back then was that translation_loop(), which is called via gen_intermediate_code() - tb_gen_code(), was not being executed because tb_gen_code() is always guarded by a tb_lookup() , and assumed that the lookup was finding the already translated TB and cpu_tb_exec() with it. Well, we just saw that incrementing the counter in both insn_start() and translate_insn() gives a far greater number than trying to sum up all the db->num_insns in tb_stop(), and all of them are called inside translator_loop(). This invalidates (I guess) the assumptions I made on tb_gen_code() up above. So yeah, I need more time to understand why I'm getting these numbers. For now, I'm really tempted into increment the instructions at insn_start() and come back at instruction counting specifics in a follow-up work. > > It is of course extremely heavyweight to call a helper to perform a simple addition operation. This increment would update all registers that are counting instructions and also fire a PMU interrupt when an overflow happens. 
I also added the frozen count PMU bit in 'hflags' so we can increment only if the PMU is running. There's a good chance that most of this can be done outside the helper in a TCG function in translate.c. That would minimize the impact of the running PMU in the translation process. I'll see how it goes. > > You may also wish to look at target/hexagon, gen_exec_counters(), which is doing the same sort of thing. But not completely, since it loses count along dynamic exception paths (e.g. faulting load). That can be fixed as well, via restore_state_to_opc. I saw the hexagon code and attempted to accumulate the instructions in a 'num_insn' ctx variable, that was incremented in insns_start(), and then update the PMU counters in tb_stop() with it. The results were similar to what I've described above with db->num_insns in tb_stop(). Thanks, Daniel > > > r~
On 8/14/21 9:13 AM, Daniel Henrique Barboza wrote: > https://github.com/torvalds/linux/blob/master/tools/testing/selftests/powerpc/pmu/count_instructions.c > > > This test runs an instruction loop that consists of 'addi' instructions . Before running > the instructions > there's an overhead calculation with an empty loop. ... > static void ppc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs) > target_ulong nip = ctx->base.pc_next; > int sse; > > + gen_helper_insns_inc(cpu_env, tcg_constant_i32(dcbase->num_insns)); > + > if (is_jmp == DISAS_NORETURN) { > /* We have already exited the TB. */ > return; You've not considered how branches are implemented. We generate code to exit the tb in gen_bcond. Anything you emit after that is dead code. r~
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 1d38b8cf7a..5c81d459f4 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -350,6 +350,7 @@ typedef struct ppc_v3_pate_t { #define MMCR0_EBE PPC_BIT(43) /* Perf Monitor EBB Enable */ #define MMCR0_FCECE PPC_BIT(38) /* FC on Enabled Cond or Event */ #define MMCR0_PMCC PPC_BITMASK(44, 45) /* PMC Control */ +#define MMCR0_PMC1CE PPC_BIT(48) #define MMCR1_PMC1SEL_SHIFT (63 - 39) #define MMCR1_PMC1SEL PPC_BITMASK(32, 39) diff --git a/target/ppc/pmu_book3s_helper.c b/target/ppc/pmu_book3s_helper.c index 43cc0eb722..58ae65e22b 100644 --- a/target/ppc/pmu_book3s_helper.c +++ b/target/ppc/pmu_book3s_helper.c @@ -25,6 +25,7 @@ * and SPAPR code. */ #define PPC_CPU_FREQ 1000000000 +#define COUNTER_NEGATIVE_VAL 0x80000000 static uint64_t get_cycles(uint64_t icount_delta) { @@ -32,22 +33,9 @@ static uint64_t get_cycles(uint64_t icount_delta) NANOSECONDS_PER_SECOND); } -static void update_PMC_PM_INST_CMPL(CPUPPCState *env, int sprn, - uint64_t icount_delta) -{ - env->spr[sprn] += icount_delta; -} - -static void update_PMC_PM_CYC(CPUPPCState *env, int sprn, - uint64_t icount_delta) -{ - env->spr[sprn] += get_cycles(icount_delta); -} - -static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, - uint64_t icount_delta) +static uint8_t get_PMC_event(CPUPPCState *env, int sprn) { - int event; + int event = 0x0; switch (sprn) { case SPR_POWER_PMC1: @@ -65,11 +53,35 @@ static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, case SPR_POWER_PMC4: event = MMCR1_PMC4SEL & env->spr[SPR_POWER_MMCR1]; break; + case SPR_POWER_PMC5: + event = 0x2; + break; + case SPR_POWER_PMC6: + event = 0x1E; + break; default: - return; + break; } - switch (event) { + return event; +} + +static void update_PMC_PM_INST_CMPL(CPUPPCState *env, int sprn, + uint64_t icount_delta) +{ + env->spr[sprn] += icount_delta; +} + +static void update_PMC_PM_CYC(CPUPPCState *env, int sprn, + uint64_t icount_delta) +{ + env->spr[sprn] += 
get_cycles(icount_delta); +} + +static void update_programmable_PMC_reg(CPUPPCState *env, int sprn, + uint64_t icount_delta) +{ + switch (get_PMC_event(env, sprn)) { case 0x2: update_PMC_PM_INST_CMPL(env, sprn, icount_delta); break; @@ -99,30 +111,57 @@ static void update_PMCs(CPUPPCState *env, uint64_t icount_delta) update_PMC_PM_CYC(env, SPR_POWER_PMC6, icount_delta); } +static void set_PMU_excp_timer(CPUPPCState *env) +{ + uint64_t timeout, now, remaining_val; + + if (!(env->spr[SPR_POWER_MMCR0] & MMCR0_PMC1CE)) { + return; + } + + remaining_val = COUNTER_NEGATIVE_VAL - env->spr[SPR_POWER_PMC1]; + + switch (get_PMC_event(env, SPR_POWER_PMC1)) { + case 0x2: + timeout = icount_to_ns(remaining_val); + break; + case 0x1e: + timeout = muldiv64(remaining_val, NANOSECONDS_PER_SECOND, + PPC_CPU_FREQ); + break; + default: + return; + } + + now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + + timer_mod(env->pmu_intr_timer, now + timeout); +} + static void cpu_ppc_pmu_timer_cb(void *opaque) { PowerPCCPU *cpu = opaque; CPUPPCState *env = &cpu->env; - uint64_t mmcr0; - - mmcr0 = env->spr[SPR_POWER_MMCR0]; - if (env->spr[SPR_POWER_MMCR0] & MMCR0_EBE) { - /* freeeze counters if needed */ - if (mmcr0 & MMCR0_FCECE) { - mmcr0 &= ~MMCR0_FCECE; - mmcr0 |= MMCR0_FC; - } + uint64_t icount_delta = (uint64_t)icount_get_raw() - env->pmu_base_icount; - /* Clear PMAE and set PMAO */ - if (mmcr0 & MMCR0_PMAE) { - mmcr0 &= ~MMCR0_PMAE; - mmcr0 |= MMCR0_PMAO; - } - env->spr[SPR_POWER_MMCR0] = mmcr0; + if (!(env->spr[SPR_POWER_MMCR0] & MMCR0_EBE)) { + return; + } + + update_PMCs(env, icount_delta); + + if (env->spr[SPR_POWER_MMCR0] & MMCR0_FCECE) { + env->spr[SPR_POWER_MMCR0] &= ~MMCR0_FCECE; + env->spr[SPR_POWER_MMCR0] |= MMCR0_FC; + } - /* Fire the PMC hardware exception */ - ppc_set_irq(cpu, PPC_INTERRUPT_PMC, 1); + if (env->spr[SPR_POWER_MMCR0] & MMCR0_PMAE) { + env->spr[SPR_POWER_MMCR0] &= ~MMCR0_PMAE; + env->spr[SPR_POWER_MMCR0] |= MMCR0_PMAO; } + + /* Fire the PMC hardware exception */ + 
ppc_set_irq(cpu, PPC_INTERRUPT_PMC, 1); } void cpu_ppc_pmu_timer_init(CPUPPCState *env) @@ -134,12 +173,19 @@ void cpu_ppc_pmu_timer_init(CPUPPCState *env) env->pmu_intr_timer = timer; } +static bool mmcr0_counter_neg_cond_enabled(uint64_t mmcr0) +{ + return mmcr0 & MMCR0_PMC1CE; +} + void helper_store_mmcr0(CPUPPCState *env, target_ulong value) { uint64_t curr_icount = (uint64_t)icount_get_raw(); bool curr_FC = env->spr[SPR_POWER_MMCR0] & MMCR0_FC; bool new_FC = value & MMCR0_FC; + env->spr[SPR_POWER_MMCR0] = value; + /* * In an frozen count (FC) bit change: * @@ -163,10 +209,19 @@ void helper_store_mmcr0(CPUPPCState *env, target_ulong value) * until the freeze. */ update_PMCs(env, icount_delta); + + /* delete pending timer */ + timer_del(env->pmu_intr_timer); } else { env->pmu_base_icount = curr_icount; + + /* + * Start performance monitor alert timer for counter negative + * events, if needed. + */ + if (mmcr0_counter_neg_cond_enabled(env->spr[SPR_POWER_MMCR0])) { + set_PMU_excp_timer(env); + } } } - - env->spr[SPR_POWER_MMCR0] = value; }
This patch starts the counter negative EBB support by enabling PMC1 counter negative condition. A counter negative condition happens when a performance monitor counter reaches the value 0x80000000. When that happens, if a counter negative condition is enabled in that counter, a performance monitor alert is triggered. For PMC1, this condition is enabled by MMCR0_PMC1CE. An icount-based logic is used to predict when we need to wake up the timer to trigger the alert in both PM_INST_CMPL (0x2) and PM_CYC (0x1E) events. The timer callback will then trigger a PPC_INTERRUPT_PMC which will become an event-based exception later. Some EBB powerpc kernel selftests are passing after this patch, but a substantial number of them rely on other PMCs to be enabled and events that we don't support at this moment. We'll address that in the next patches. Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com> --- target/ppc/cpu.h | 1 + target/ppc/pmu_book3s_helper.c | 127 +++++++++++++++++++++++---------- 2 files changed, 92 insertions(+), 36 deletions(-)