
[RFC,8/9] RISC-V: KVM: Implement perf support

Message ID 20220718170205.2972215-9-atishp@rivosinc.com (mailing list archive)
State New, archived
Series KVM perf support

Commit Message

Atish Patra July 18, 2022, 5:02 p.m. UTC
The RISC-V SBI PMU and Sscofpmf ISA extensions allow supporting perf in
the virtualization environment as well. The KVM implementation
relies on the SBI PMU extension for the most part, while trapping
and emulating the CSR reads for counter access.

Signed-off-by: Atish Patra <atishp@rivosinc.com>
---
 arch/riscv/kvm/vcpu_pmu.c | 318 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 301 insertions(+), 17 deletions(-)

Comments

Eric Lin Sept. 20, 2022, 2:24 a.m. UTC | #1
Hi Atish,

On Tue, Jul 19, 2022 at 2:01 AM Atish Patra <atishp@rivosinc.com> wrote:
>
> RISC-V SBI PMU & Sscofpmf ISA extension allows supporting perf in
> the virtualization enviornment as well. KVM implementation
> relies on SBI PMU extension for most of the part while traps
> & emulates the CSRs read for counter access.
>
> Signed-off-by: Atish Patra <atishp@rivosinc.com>
> ---
>  arch/riscv/kvm/vcpu_pmu.c | 318 ++++++++++++++++++++++++++++++++++++--
>  1 file changed, 301 insertions(+), 17 deletions(-)
>
> diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c
> index 5434051f495d..278c261efad3 100644
> --- a/arch/riscv/kvm/vcpu_pmu.c
> +++ b/arch/riscv/kvm/vcpu_pmu.c
> @@ -11,9 +11,163 @@
>  #include <linux/kvm_host.h>
>  #include <linux/perf/riscv_pmu.h>
>  #include <asm/csr.h>
> +#include <asm/bitops.h>
>  #include <asm/kvm_vcpu_pmu.h>
>  #include <linux/kvm_host.h>
>
> +#define get_event_type(x) ((x & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16)
> +#define get_event_code(x) (x & SBI_PMU_EVENT_IDX_CODE_MASK)
> +
> +static inline u64 pmu_get_sample_period(struct kvm_pmc *pmc)
> +{
> +       u64 counter_val_mask = GENMASK(pmc->cinfo.width, 0);
> +       u64 sample_period;
> +
> +       if (!pmc->counter_val)
> +               sample_period = counter_val_mask;
> +       else
> +               sample_period = pmc->counter_val & counter_val_mask;

I think sample_period should be =>
sample_period = (-pmc->counter_val) & counter_val_mask

When we are doing event counting, the initial pmu counter value that
comes from the guest kernel is 0x800000000000000X.
If we let the sample period be (pmc->counter_val) & counter_val_mask,
the sample_period will be 0x800000000000000X.
After we pass this sample_period to the host pmu driver,
riscv_pmu_event_set_period() will make the final pmu counter
initial value 0xffffffffffXX.
This makes the pmu counter overflow interrupt fire frequently and
triggers a soft lockup in the kvm guest.

I also checked the arm64 kvm perf implementation, shown below; its
sample_period is attr.sample_period = (-counter) & GENMASK(63, 0):

static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
{
...
                /* The initial sample period (overflow count) of an event. */
                if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
                        attr.sample_period = (-counter) & GENMASK(63, 0);
                else
                        attr.sample_period = (-counter) & GENMASK(31, 0);

After I apply the patch below, the counter overflow interrupt no longer
occurs, and the initial pmu counter value is the same as when we do
event counting in the host.

--- a/arch/riscv/kvm/vcpu_pmu.c
+++ b/arch/riscv/kvm/vcpu_pmu.c
@@ -26,7 +26,7 @@ static inline u64 pmu_get_sample_period(struct kvm_pmc *pmc)
        if (!pmc->counter_val)
                sample_period = counter_val_mask;
        else
-               sample_period = pmc->counter_val & counter_val_mask;
+               sample_period = (-pmc->counter_val) & counter_val_mask;
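
To make the arithmetic concrete, here is a tiny stand-alone sketch (plain
user-space C, hypothetical values; the mask corresponds to a counter whose
cinfo.width is 63, i.e. GENMASK(63, 0)) comparing the two formulas:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Hypothetical initial counter value programmed by the guest */
        uint64_t counter_val = 0x8000000000000005ULL;
        uint64_t mask = ~0ULL;                             /* GENMASK(63, 0) */

        uint64_t current_formula  = counter_val & mask;    /* 0x8000000000000005 */
        uint64_t proposed_formula = (-counter_val) & mask; /* 0x7ffffffffffffffb */

        /*
         * The proposed value is the number of events left before the guest
         * counter wraps, which is what perf expects in sample_period.
         */
        printf("current:  %#llx\nproposed: %#llx\n",
               (unsigned long long)current_formula,
               (unsigned long long)proposed_formula);
        return 0;
}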

Thanks,
Eric Lin

> +
> +       return sample_period;
> +}
> +
> +static u32 pmu_get_perf_event_type(unsigned long eidx)
> +{
> +       enum sbi_pmu_event_type etype = get_event_type(eidx);
> +       u32 type;
> +
> +       if (etype == SBI_PMU_EVENT_TYPE_HW)
> +               type = PERF_TYPE_HARDWARE;
> +       else if (etype == SBI_PMU_EVENT_TYPE_CACHE)
> +               type = PERF_TYPE_HW_CACHE;
> +       else if (etype == SBI_PMU_EVENT_TYPE_RAW || etype == SBI_PMU_EVENT_TYPE_FW)
> +               type = PERF_TYPE_RAW;
> +       else
> +               type = PERF_TYPE_MAX;
> +
> +       return type;
> +}
> +
> +static inline bool pmu_is_fw_event(unsigned long eidx)
> +{
> +       enum sbi_pmu_event_type etype = get_event_type(eidx);
> +
> +       return (etype == SBI_PMU_EVENT_TYPE_FW) ? true : false;
> +}
> +
> +static void pmu_release_perf_event(struct kvm_pmc *pmc)
> +{
> +       if (pmc->perf_event) {
> +               perf_event_disable(pmc->perf_event);
> +               perf_event_release_kernel(pmc->perf_event);
> +               pmc->perf_event = NULL;
> +       }
> +}
> +
> +static u64 pmu_get_perf_event_hw_config(u32 sbi_event_code)
> +{
> +       /* SBI PMU HW event code is offset by 1 from perf hw event codes */
> +       return (u64)sbi_event_code - 1;
> +}
> +
> +static u64 pmu_get_perf_event_cache_config(u32 sbi_event_code)
> +{
> +       u64 config = U64_MAX;
> +       unsigned int cache_type, cache_op, cache_result;
> +
> +       /* All the cache event masks lie within 0xFF. No separate masking is necesssary */
> +       cache_type = (sbi_event_code & SBI_PMU_EVENT_CACHE_ID_CODE_MASK) >> 3;
> +       cache_op = (sbi_event_code & SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK) >> 1;
> +       cache_result = sbi_event_code & SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK;
> +
> +       if (cache_type >= PERF_COUNT_HW_CACHE_MAX ||
> +           cache_op >= PERF_COUNT_HW_CACHE_OP_MAX ||
> +           cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
> +               goto out;
> +       config = cache_type | (cache_op << 8) | (cache_result << 16);
> +out:
> +       return config;
> +}
> +
> +static u64 pmu_get_perf_event_config(unsigned long eidx, uint64_t edata)
> +{
> +       enum sbi_pmu_event_type etype = get_event_type(eidx);
> +       u32 ecode = get_event_code(eidx);
> +       u64 config = U64_MAX;
> +
> +       if (etype == SBI_PMU_EVENT_TYPE_HW)
> +               config = pmu_get_perf_event_hw_config(ecode);
> +       else if (etype == SBI_PMU_EVENT_TYPE_CACHE)
> +               config = pmu_get_perf_event_cache_config(ecode);
> +       else if (etype == SBI_PMU_EVENT_TYPE_RAW)
> +               config = edata & RISCV_PMU_RAW_EVENT_MASK;
> +       else if ((etype == SBI_PMU_EVENT_TYPE_FW) && (ecode < SBI_PMU_FW_MAX))
> +               config = (1ULL << 63) | ecode;
> +
> +       return config;
> +}
> +
> +static int pmu_get_fixed_pmc_index(unsigned long eidx)
> +{
> +       u32 etype = pmu_get_perf_event_type(eidx);
> +       u32 ecode = get_event_code(eidx);
> +       int ctr_idx;
> +
> +       if (etype != SBI_PMU_EVENT_TYPE_HW)
> +               return -EINVAL;
> +
> +       if (ecode == SBI_PMU_HW_CPU_CYCLES)
> +               ctr_idx = 0;
> +       else if (ecode == SBI_PMU_HW_INSTRUCTIONS)
> +               ctr_idx = 2;
> +       else
> +               return -EINVAL;
> +
> +       return ctr_idx;
> +}
> +
> +static int pmu_get_programmable_pmc_index(struct kvm_pmu *kvpmu, unsigned long eidx,
> +                                         unsigned long cbase, unsigned long cmask)
> +{
> +       int ctr_idx = -1;
> +       int i, pmc_idx;
> +       int min, max;
> +
> +       if (pmu_is_fw_event(eidx)) {
> +               /* Firmware counters are mapped 1:1 starting from num_hw_ctrs for simplicity */
> +               min = kvpmu->num_hw_ctrs;
> +               max = min + kvpmu->num_fw_ctrs;
> +       } else {
> +               /* First 3 counters are reserved for fixed counters */
> +               min = 3;
> +               max = kvpmu->num_hw_ctrs;
> +       }
> +
> +       for_each_set_bit(i, &cmask, BITS_PER_LONG) {
> +               pmc_idx = i + cbase;
> +               if ((pmc_idx >= min && pmc_idx < max) &&
> +                   !test_bit(pmc_idx, kvpmu->used_pmc)) {
> +                       ctr_idx = pmc_idx;
> +                       break;
> +               }
> +       }
> +
> +       return ctr_idx;
> +}
> +
> +static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx,
> +                            unsigned long cbase, unsigned long cmask)
> +{
> +       int ret;
> +
> +       /* Fixed counters need to be have fixed mapping as they have different width */
> +       ret = pmu_get_fixed_pmc_index(eidx);
> +       if (ret >= 0)
> +               return ret;
> +
> +       return pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask);
> +}
> +
>  int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
>                                 unsigned long *out_val)
>  {
> @@ -43,7 +197,6 @@ int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num,
>
>         if (!kvpmu)
>                 return KVM_INSN_EXIT_TO_USER_SPACE;
> -       //TODO: Should we check if vcpu pmu is initialized or not!
>         if (wr_mask)
>                 return KVM_INSN_ILLEGAL_TRAP;
>         cidx = csr_num - CSR_CYCLE;
> @@ -81,14 +234,62 @@ int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx,
>  int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base,
>                                  unsigned long ctr_mask, unsigned long flag, uint64_t ival)
>  {
> -       /* TODO */
> +       struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> +       int i, num_ctrs, pmc_index;
> +       struct kvm_pmc *pmc;
> +
> +       num_ctrs = kvpmu->num_fw_ctrs + kvpmu->num_hw_ctrs;
> +       if (ctr_base + __fls(ctr_mask) >= num_ctrs)
> +               return -EINVAL;
> +
> +       /* Start the counters that have been configured and requested by the guest */
> +       for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
> +               pmc_index = i + ctr_base;
> +               if (!test_bit(pmc_index, kvpmu->used_pmc))
> +                       continue;
> +               pmc = &kvpmu->pmc[pmc_index];
> +               if (flag & SBI_PMU_START_FLAG_SET_INIT_VALUE)
> +                       pmc->counter_val = ival;
> +               if (pmc->perf_event) {
> +                       perf_event_period(pmc->perf_event, pmu_get_sample_period(pmc));
> +                       perf_event_enable(pmc->perf_event);
> +               }
> +       }
> +
>         return 0;
>  }
>
>  int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base,
>                                  unsigned long ctr_mask, unsigned long flag)
>  {
> -       /* TODO */
> +       struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> +       int i, num_ctrs, pmc_index;
> +       u64 enabled, running;
> +       struct kvm_pmc *pmc;
> +
> +       num_ctrs = kvpmu->num_fw_ctrs + kvpmu->num_hw_ctrs;
> +       if ((ctr_base + __fls(ctr_mask)) >= num_ctrs)
> +               return -EINVAL;
> +
> +       /* Stop the counters that have been configured and requested by the guest */
> +       for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
> +               pmc_index = i + ctr_base;
> +               if (!test_bit(pmc_index, kvpmu->used_pmc))
> +                       continue;
> +               pmc = &kvpmu->pmc[pmc_index];
> +               if (pmc->perf_event) {
> +                       /* Stop counting the counter */
> +                       perf_event_disable(pmc->perf_event);
> +                       if (flag & SBI_PMU_STOP_FLAG_RESET) {
> +                               /* Relase the counter if this is a reset request */
> +                               pmc->counter_val += perf_event_read_value(pmc->perf_event,
> +                                                                         &enabled, &running);
> +                               pmu_release_perf_event(pmc);
> +                               clear_bit(pmc_index, kvpmu->used_pmc);
> +                       }
> +               }
> +       }
> +
>         return 0;
>  }
>
> @@ -96,14 +297,85 @@ int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_ba
>                                      unsigned long ctr_mask, unsigned long flag,
>                                      unsigned long eidx, uint64_t edata)
>  {
> -       /* TODO */
> -       return 0;
> +       struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> +       struct perf_event *event;
> +       struct perf_event_attr attr;
> +       int num_ctrs, ctr_idx;
> +       u32 etype = pmu_get_perf_event_type(eidx);
> +       u64 config;
> +       struct kvm_pmc *pmc;
> +
> +       num_ctrs = kvpmu->num_fw_ctrs + kvpmu->num_hw_ctrs;
> +       if ((etype == PERF_TYPE_MAX) || ((ctr_base + __fls(ctr_mask)) >= num_ctrs))
> +               return -EINVAL;
> +
> +       if (pmu_is_fw_event(eidx))
> +               return -EOPNOTSUPP;
> +       /*
> +        * SKIP_MATCH flag indicates the caller is aware of the assigned counter
> +        * for this event. Just do a sanity check if it already marked used.
> +        */
> +       if (flag & SBI_PMU_CFG_FLAG_SKIP_MATCH) {
> +               if (!test_bit(ctr_base, kvpmu->used_pmc))
> +                       return -EINVAL;
> +               ctr_idx = ctr_base;
> +               goto match_done;
> +       }
> +
> +       ctr_idx = pmu_get_pmc_index(kvpmu, eidx, ctr_base, ctr_mask);
> +       if (ctr_idx < 0)
> +               return -EOPNOTSUPP;
> +
> +match_done:
> +       pmc = &kvpmu->pmc[ctr_idx];
> +       pmu_release_perf_event(pmc);
> +       pmc->idx = ctr_idx;
> +
> +       config = pmu_get_perf_event_config(eidx, edata);
> +       memset(&attr, 0, sizeof(struct perf_event_attr));
> +       attr.type = etype;
> +       attr.size = sizeof(attr);
> +       attr.pinned = true;
> +
> +       /*
> +        * It should never reach here if the platform doesn't support sscofpmf extensio
> +        * as mode filtering won't work without it.
> +        */
> +       attr.exclude_host = true;
> +       attr.exclude_hv = true;
> +       attr.exclude_user = flag & SBI_PMU_CFG_FLAG_SET_UINH ? 1 : 0;
> +       attr.exclude_kernel = flag & SBI_PMU_CFG_FLAG_SET_SINH ? 1 : 0;
> +       attr.config = config;
> +       attr.config1 = RISCV_KVM_PMU_CONFIG1_GUEST_EVENTS;
> +       if (flag & SBI_PMU_CFG_FLAG_CLEAR_VALUE) {
> +               //TODO: Do we really want to clear the value in hardware counter
> +               pmc->counter_val = 0;
> +       }
> +       /*
> +        * Set the default sample_period for now. The guest specified value
> +        * will be updated in the start call.
> +        */
> +       attr.sample_period = pmu_get_sample_period(pmc);
> +
> +       event = perf_event_create_kernel_counter(&attr, -1, current, NULL, pmc);
> +       if (IS_ERR(event)) {
> +               pr_err("kvm pmu event creation failed event %pe for eidx %lx\n", event, eidx);
> +               return -EOPNOTSUPP;
> +       }
> +
> +       set_bit(ctr_idx, kvpmu->used_pmc);
> +       pmc->perf_event = event;
> +       if (flag & SBI_PMU_CFG_FLAG_AUTO_START)
> +               perf_event_enable(pmc->perf_event);
> +
> +       return ctr_idx;
>  }
>
>  int kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
>  {
>         int i = 0, num_hw_ctrs, num_fw_ctrs, hpm_width;
>         struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> +       struct kvm_pmc *pmc;
>
>         if (!kvpmu)
>                 return -EINVAL;
> @@ -120,6 +392,7 @@ int kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
>                 return -EINVAL;
>         }
>
> +       bitmap_zero(kvpmu->used_pmc, RISCV_MAX_COUNTERS);
>         kvpmu->num_hw_ctrs = num_hw_ctrs;
>         kvpmu->num_fw_ctrs = num_fw_ctrs;
>         /*
> @@ -132,38 +405,49 @@ int kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
>                 /* TIME CSR shouldn't be read from perf interface */
>                 if (i == 1)
>                         continue;
> -               kvpmu->pmc[i].idx = i;
> -               kvpmu->pmc[i].vcpu = vcpu;
> +               pmc = &kvpmu->pmc[i];
> +               pmc->idx = i;
> +               pmc->counter_val = 0;
> +               pmc->vcpu = vcpu;
>                 if (i < kvpmu->num_hw_ctrs) {
>                         kvpmu->pmc[i].cinfo.type = SBI_PMU_CTR_TYPE_HW;
>                         if (i < 3)
>                                 /* CY, IR counters */
> -                               kvpmu->pmc[i].cinfo.width = 63;
> +                               pmc->cinfo.width = 63;
>                         else
> -                               kvpmu->pmc[i].cinfo.width = hpm_width;
> +                               pmc->cinfo.width = hpm_width;
>                         /*
>                          * The CSR number doesn't have any relation with the logical
>                          * hardware counters. The CSR numbers are encoded sequentially
>                          * to avoid maintaining a map between the virtual counter
>                          * and CSR number.
>                          */
> -                       kvpmu->pmc[i].cinfo.csr = CSR_CYCLE + i;
> +                       pmc->cinfo.csr = CSR_CYCLE + i;
>                 } else {
> -                       kvpmu->pmc[i].cinfo.type = SBI_PMU_CTR_TYPE_FW;
> -                       kvpmu->pmc[i].cinfo.width = BITS_PER_LONG - 1;
> +                       pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW;
> +                       pmc->cinfo.width = BITS_PER_LONG - 1;
>                 }
>         }
>
>         return 0;
>  }
>
> -void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
> +void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
>  {
> -       /* TODO */
> +       struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> +       struct kvm_pmc *pmc;
> +       int i;
> +
> +       if (!kvpmu)
> +               return;
> +
> +       for_each_set_bit(i, kvpmu->used_pmc, RISCV_MAX_COUNTERS) {
> +               pmc = &kvpmu->pmc[i];
> +               pmu_release_perf_event(pmc);
> +       }
>  }
>
> -void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
> +void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
>  {
> -       /* TODO */
> +       kvm_riscv_vcpu_pmu_deinit(vcpu);
>  }
> -
> --
> 2.25.1
>
>
Atish Patra Sept. 23, 2022, 9:04 p.m. UTC | #2
On Mon, Sep 19, 2022 at 7:24 PM Eric Lin <eric.lin@sifive.com> wrote:
>
> Hi Atish,
>
> On Tue, Jul 19, 2022 at 2:01 AM Atish Patra <atishp@rivosinc.com> wrote:
> >
> > RISC-V SBI PMU & Sscofpmf ISA extension allows supporting perf in
> > the virtualization enviornment as well. KVM implementation
> > relies on SBI PMU extension for most of the part while traps
> > & emulates the CSRs read for counter access.
> >
> > Signed-off-by: Atish Patra <atishp@rivosinc.com>
> > ---
> >  arch/riscv/kvm/vcpu_pmu.c | 318 ++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 301 insertions(+), 17 deletions(-)
> >
> > diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c
> > index 5434051f495d..278c261efad3 100644
> > --- a/arch/riscv/kvm/vcpu_pmu.c
> > +++ b/arch/riscv/kvm/vcpu_pmu.c
> > @@ -11,9 +11,163 @@
> >  #include <linux/kvm_host.h>
> >  #include <linux/perf/riscv_pmu.h>
> >  #include <asm/csr.h>
> > +#include <asm/bitops.h>
> >  #include <asm/kvm_vcpu_pmu.h>
> >  #include <linux/kvm_host.h>
> >
> > +#define get_event_type(x) ((x & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16)
> > +#define get_event_code(x) (x & SBI_PMU_EVENT_IDX_CODE_MASK)
> > +
> > +static inline u64 pmu_get_sample_period(struct kvm_pmc *pmc)
> > +{
> > +       u64 counter_val_mask = GENMASK(pmc->cinfo.width, 0);
> > +       u64 sample_period;
> > +
> > +       if (!pmc->counter_val)
> > +               sample_period = counter_val_mask;
> > +       else
> > +               sample_period = pmc->counter_val & counter_val_mask;
>
> I think sample_period should be =>
> sample_period = (-pmc->counter_val) & counter_val_mask
>
> When we are doing event counting, the pmu counter initial value comes
> from the guest kernel is 0x800000000000000X.
> If we let the sample period be the (pmc->counter_val) &
> counter_val_mask, the sample_period will be 0x800000000000000X.
> After we pass this sample_period to the host pmu driver, in
> riscv_pmu_event_set_period(), it will make the final pmu counter
> initial value be 0xffffffffffXX.
> This will make the pmu counter overflow interrupt frequently and
> trigger soft lockup in kvm guest.
>
> I also checked the arm64 kvm perf implementation as below, its
> sample_period is attr.sample_period = (-counter) & GENMASK(63, 0)
>
>  624 static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64
> select_idx)
>  625 {
> ....
>  688                 /* The initial sample period (overflow count) of
> an event. */
>  689                 if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
>  690                         attr.sample_period = (-counter) & GENMASK(63, 0);
>  691                 else
>  692                         attr.sample_period = (-counter) & GENMASK(31, 0);
>
> After I apply the patch as below, no occur counter overflow interrupt
> and the pmu counter initial value is the same as we do event counting
> in the host.
>
> --- a/arch/riscv/kvm/vcpu_pmu.c
> +++ b/arch/riscv/kvm/vcpu_pmu.c
> @@ -26,7 +26,7 @@ static inline u64 pmu_get_sample_period(struct kvm_pmc *pmc)
>         if (!pmc->counter_val)
>                 sample_period = counter_val_mask;
>         else
> -               sample_period = pmc->counter_val & counter_val_mask;
> +               sample_period = (-pmc->counter_val) & counter_val_mask;
>

Yes. Thanks for the catch.
The sample_period should be (-pmc->counter_val) & counter_val_mask.

I will revise the patch along with a few other fixes and send the v2.
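
For reference, with that one-line change applied the helper would read
roughly as follows (just a sketch of the expected v2, not the posted patch):

static inline u64 pmu_get_sample_period(struct kvm_pmc *pmc)
{
        u64 counter_val_mask = GENMASK(pmc->cinfo.width, 0);
        u64 sample_period;

        if (!pmc->counter_val)
                sample_period = counter_val_mask;
        else
                /* Number of events remaining until the guest counter wraps */
                sample_period = (-pmc->counter_val) & counter_val_mask;

        return sample_period;
}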

> Thanks,
> Eric Lin
>
Andrew Jones Nov. 1, 2022, 3:31 p.m. UTC | #3
On Mon, Jul 18, 2022 at 10:02:04AM -0700, Atish Patra wrote:
> RISC-V SBI PMU & Sscofpmf ISA extension allows supporting perf in
> the virtualization enviornment as well. KVM implementation
> relies on SBI PMU extension for most of the part while traps

...relies on the SBI PMU extension for the most part while trapping
and emulating...

> & emulates the CSRs read for counter access.
> 
> Signed-off-by: Atish Patra <atishp@rivosinc.com>
> ---
>  arch/riscv/kvm/vcpu_pmu.c | 318 ++++++++++++++++++++++++++++++++++++--
>  1 file changed, 301 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c
> index 5434051f495d..278c261efad3 100644
> --- a/arch/riscv/kvm/vcpu_pmu.c
> +++ b/arch/riscv/kvm/vcpu_pmu.c
> @@ -11,9 +11,163 @@
>  #include <linux/kvm_host.h>
>  #include <linux/perf/riscv_pmu.h>
>  #include <asm/csr.h>
> +#include <asm/bitops.h>
>  #include <asm/kvm_vcpu_pmu.h>
>  #include <linux/kvm_host.h>
>  
> +#define get_event_type(x) ((x & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16)
> +#define get_event_code(x) (x & SBI_PMU_EVENT_IDX_CODE_MASK)
> +
> +static inline u64 pmu_get_sample_period(struct kvm_pmc *pmc)
> +{
> +	u64 counter_val_mask = GENMASK(pmc->cinfo.width, 0);
> +	u64 sample_period;
> +
> +	if (!pmc->counter_val)
> +		sample_period = counter_val_mask;
> +	else
> +		sample_period = pmc->counter_val & counter_val_mask;
> +
> +	return sample_period;
> +}
> +
> +static u32 pmu_get_perf_event_type(unsigned long eidx)
> +{
> +	enum sbi_pmu_event_type etype = get_event_type(eidx);
> +	u32 type;
> +
> +	if (etype == SBI_PMU_EVENT_TYPE_HW)
> +		type = PERF_TYPE_HARDWARE;
> +	else if (etype == SBI_PMU_EVENT_TYPE_CACHE)
> +		type = PERF_TYPE_HW_CACHE;
> +	else if (etype == SBI_PMU_EVENT_TYPE_RAW || etype == SBI_PMU_EVENT_TYPE_FW)
> +		type = PERF_TYPE_RAW;
> +	else
> +		type = PERF_TYPE_MAX;
> +
> +	return type;
> +}
> +
> +static inline bool pmu_is_fw_event(unsigned long eidx)
> +{
> +	enum sbi_pmu_event_type etype = get_event_type(eidx);
> +
> +	return (etype == SBI_PMU_EVENT_TYPE_FW) ? true : false;

 return get_event_type(eidx) == SBI_PMU_EVENT_TYPE_FW;

> +}
> +
> +static void pmu_release_perf_event(struct kvm_pmc *pmc)
> +{
> +	if (pmc->perf_event) {
> +		perf_event_disable(pmc->perf_event);
> +		perf_event_release_kernel(pmc->perf_event);
> +		pmc->perf_event = NULL;
> +	}
> +}
> +
> +static u64 pmu_get_perf_event_hw_config(u32 sbi_event_code)
> +{
> +	/* SBI PMU HW event code is offset by 1 from perf hw event codes */
> +	return (u64)sbi_event_code - 1;
> +}
> +
> +static u64 pmu_get_perf_event_cache_config(u32 sbi_event_code)
> +{
> +	u64 config = U64_MAX;
> +	unsigned int cache_type, cache_op, cache_result;
> +
> +	/* All the cache event masks lie within 0xFF. No separate masking is necesssary */
> +	cache_type = (sbi_event_code & SBI_PMU_EVENT_CACHE_ID_CODE_MASK) >> 3;
> +	cache_op = (sbi_event_code & SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK) >> 1;
> +	cache_result = sbi_event_code & SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK;
> +
> +	if (cache_type >= PERF_COUNT_HW_CACHE_MAX ||
> +	    cache_op >= PERF_COUNT_HW_CACHE_OP_MAX ||
> +	    cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
> +		goto out;

No goto necessary

        if (...)
                return U64_MAX;
        return cache_type | (cache_op << 8) | (cache_result << 16);
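
Applied to the whole helper, that would look roughly like this (sketch only,
behaviour unchanged apart from dropping the label):

static u64 pmu_get_perf_event_cache_config(u32 sbi_event_code)
{
        unsigned int cache_type, cache_op, cache_result;

        /* All the cache event masks lie within 0xFF. No separate masking is necessary */
        cache_type = (sbi_event_code & SBI_PMU_EVENT_CACHE_ID_CODE_MASK) >> 3;
        cache_op = (sbi_event_code & SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK) >> 1;
        cache_result = sbi_event_code & SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK;

        if (cache_type >= PERF_COUNT_HW_CACHE_MAX ||
            cache_op >= PERF_COUNT_HW_CACHE_OP_MAX ||
            cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
                return U64_MAX;

        return cache_type | (cache_op << 8) | (cache_result << 16);
}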

> +	config = cache_type | (cache_op << 8) | (cache_result << 16);
> +out:
> +	return config;
> +}
> +
> +static u64 pmu_get_perf_event_config(unsigned long eidx, uint64_t edata)
> +{
> +	enum sbi_pmu_event_type etype = get_event_type(eidx);
> +	u32 ecode = get_event_code(eidx);
> +	u64 config = U64_MAX;
> +
> +	if (etype == SBI_PMU_EVENT_TYPE_HW)
> +		config = pmu_get_perf_event_hw_config(ecode);
> +	else if (etype == SBI_PMU_EVENT_TYPE_CACHE)
> +		config = pmu_get_perf_event_cache_config(ecode);
> +	else if (etype == SBI_PMU_EVENT_TYPE_RAW)
> +		config = edata & RISCV_PMU_RAW_EVENT_MASK;
> +	else if ((etype == SBI_PMU_EVENT_TYPE_FW) && (ecode < SBI_PMU_FW_MAX))
> +		config = (1ULL << 63) | ecode;
> +
> +	return config;
> +}
> +
> +static int pmu_get_fixed_pmc_index(unsigned long eidx)
> +{
> +	u32 etype = pmu_get_perf_event_type(eidx);
> +	u32 ecode = get_event_code(eidx);
> +	int ctr_idx;
> +
> +	if (etype != SBI_PMU_EVENT_TYPE_HW)
> +		return -EINVAL;
> +
> +	if (ecode == SBI_PMU_HW_CPU_CYCLES)
> +		ctr_idx = 0;
> +	else if (ecode == SBI_PMU_HW_INSTRUCTIONS)
> +		ctr_idx = 2;
> +	else
> +		return -EINVAL;
> +
> +	return ctr_idx;
> +}
> +
> +static int pmu_get_programmable_pmc_index(struct kvm_pmu *kvpmu, unsigned long eidx,
> +					  unsigned long cbase, unsigned long cmask)
> +{
> +	int ctr_idx = -1;
> +	int i, pmc_idx;
> +	int min, max;
> +
> +	if (pmu_is_fw_event(eidx)) {
> +		/* Firmware counters are mapped 1:1 starting from num_hw_ctrs for simplicity */
> +		min = kvpmu->num_hw_ctrs;
> +		max = min + kvpmu->num_fw_ctrs;
> +	} else {
> +		/* First 3 counters are reserved for fixed counters */
> +		min = 3;
> +		max = kvpmu->num_hw_ctrs;
> +	}
> +
> +	for_each_set_bit(i, &cmask, BITS_PER_LONG) {
> +		pmc_idx = i + cbase;
> +		if ((pmc_idx >= min && pmc_idx < max) &&
> +		    !test_bit(pmc_idx, kvpmu->used_pmc)) {
> +			ctr_idx = pmc_idx;
> +			break;
> +		}
> +	}
> +
> +	return ctr_idx;
> +}
> +
> +static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx,
> +			     unsigned long cbase, unsigned long cmask)
> +{
> +	int ret;
> +
> +	/* Fixed counters need to be have fixed mapping as they have different width */
> +	ret = pmu_get_fixed_pmc_index(eidx);
> +	if (ret >= 0)
> +		return ret;
> +
> +	return pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask);
> +}
> +
>  int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
>  				unsigned long *out_val)
>  {
> @@ -43,7 +197,6 @@ int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num,
>  
>  	if (!kvpmu)
>  		return KVM_INSN_EXIT_TO_USER_SPACE;
> -	//TODO: Should we check if vcpu pmu is initialized or not!
>  	if (wr_mask)
>  		return KVM_INSN_ILLEGAL_TRAP;
>  	cidx = csr_num - CSR_CYCLE;
> @@ -81,14 +234,62 @@ int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx,
>  int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base,
>  				 unsigned long ctr_mask, unsigned long flag, uint64_t ival)
>  {
> -	/* TODO */
> +	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> +	int i, num_ctrs, pmc_index;
> +	struct kvm_pmc *pmc;
> +
> +	num_ctrs = kvpmu->num_fw_ctrs + kvpmu->num_hw_ctrs;
> +	if (ctr_base + __fls(ctr_mask) >= num_ctrs)
> +		return -EINVAL;
> +
> +	/* Start the counters that have been configured and requested by the guest */
> +	for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
> +		pmc_index = i + ctr_base;
> +		if (!test_bit(pmc_index, kvpmu->used_pmc))
> +			continue;
> +		pmc = &kvpmu->pmc[pmc_index];
> +		if (flag & SBI_PMU_START_FLAG_SET_INIT_VALUE)
> +			pmc->counter_val = ival;
> +		if (pmc->perf_event) {
> +			perf_event_period(pmc->perf_event, pmu_get_sample_period(pmc));
> +			perf_event_enable(pmc->perf_event);
> +		}

What about checking the "SBI_ERR_ALREADY_STARTED" condition?

> +	}
> +
>  	return 0;
>  }
>  
>  int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base,
>  				 unsigned long ctr_mask, unsigned long flag)
>  {
> -	/* TODO */
> +	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> +	int i, num_ctrs, pmc_index;
> +	u64 enabled, running;
> +	struct kvm_pmc *pmc;
> +
> +	num_ctrs = kvpmu->num_fw_ctrs + kvpmu->num_hw_ctrs;
> +	if ((ctr_base + __fls(ctr_mask)) >= num_ctrs)
> +		return -EINVAL;
> +
> +	/* Stop the counters that have been configured and requested by the guest */
> +	for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
> +		pmc_index = i + ctr_base;
> +		if (!test_bit(pmc_index, kvpmu->used_pmc))
> +			continue;
> +		pmc = &kvpmu->pmc[pmc_index];
> +		if (pmc->perf_event) {
> +			/* Stop counting the counter */
> +			perf_event_disable(pmc->perf_event);
> +			if (flag & SBI_PMU_STOP_FLAG_RESET) {
> +				/* Relase the counter if this is a reset request */
> +				pmc->counter_val += perf_event_read_value(pmc->perf_event,
> +									  &enabled, &running);
> +				pmu_release_perf_event(pmc);
> +				clear_bit(pmc_index, kvpmu->used_pmc);
> +			}

What about checking the "SBI_ERR_ALREADY_STOPPED" condition?

> +		}
> +	}
> +
>  	return 0;
>  }
>  
> @@ -96,14 +297,85 @@ int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_ba
>  				     unsigned long ctr_mask, unsigned long flag,
>  				     unsigned long eidx, uint64_t edata)
>  {
> -	/* TODO */
> -	return 0;
> +	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> +	struct perf_event *event;
> +	struct perf_event_attr attr;
> +	int num_ctrs, ctr_idx;
> +	u32 etype = pmu_get_perf_event_type(eidx);
> +	u64 config;
> +	struct kvm_pmc *pmc;
> +
> +	num_ctrs = kvpmu->num_fw_ctrs + kvpmu->num_hw_ctrs;
> +	if ((etype == PERF_TYPE_MAX) || ((ctr_base + __fls(ctr_mask)) >= num_ctrs))

nit: Unnecessary ()

> +		return -EINVAL;
> +
> +	if (pmu_is_fw_event(eidx))
> +		return -EOPNOTSUPP;

nit: need blank line here

> +	/*
> +	 * SKIP_MATCH flag indicates the caller is aware of the assigned counter
> +	 * for this event. Just do a sanity check if it already marked used.
> +	 */
> +	if (flag & SBI_PMU_CFG_FLAG_SKIP_MATCH) {
> +		if (!test_bit(ctr_base, kvpmu->used_pmc))
> +			return -EINVAL;
> +		ctr_idx = ctr_base;
> +		goto match_done;
> +	}
> +
> +	ctr_idx = pmu_get_pmc_index(kvpmu, eidx, ctr_base, ctr_mask);
> +	if (ctr_idx < 0)
> +		return -EOPNOTSUPP;
> +
> +match_done:
> +	pmc = &kvpmu->pmc[ctr_idx];
> +	pmu_release_perf_event(pmc);
> +	pmc->idx = ctr_idx;
> +
> +	config = pmu_get_perf_event_config(eidx, edata);
> +	memset(&attr, 0, sizeof(struct perf_event_attr));
> +	attr.type = etype;
> +	attr.size = sizeof(attr);
> +	attr.pinned = true;
> +
> +	/*
> +	 * It should never reach here if the platform doesn't support sscofpmf extensio
> +	 * as mode filtering won't work without it.
> +	 */
> +	attr.exclude_host = true;
> +	attr.exclude_hv = true;
> +	attr.exclude_user = flag & SBI_PMU_CFG_FLAG_SET_UINH ? 1 : 0;
> +	attr.exclude_kernel = flag & SBI_PMU_CFG_FLAG_SET_SINH ? 1 : 0;

nit: can use !!(flag & SBI_PMU_CFG_FLAG_SET_UINH)

> +	attr.config = config;
> +	attr.config1 = RISCV_KVM_PMU_CONFIG1_GUEST_EVENTS;
> +	if (flag & SBI_PMU_CFG_FLAG_CLEAR_VALUE) {
> +		//TODO: Do we really want to clear the value in hardware counter
> +		pmc->counter_val = 0;
> +	}

nit: add blank line

> +	/*
> +	 * Set the default sample_period for now. The guest specified value
> +	 * will be updated in the start call.
> +	 */
> +	attr.sample_period = pmu_get_sample_period(pmc);
> +
> +	event = perf_event_create_kernel_counter(&attr, -1, current, NULL, pmc);
> +	if (IS_ERR(event)) {
> +		pr_err("kvm pmu event creation failed event %pe for eidx %lx\n", event, eidx);
> +		return -EOPNOTSUPP;
> +	}
> +
> +	set_bit(ctr_idx, kvpmu->used_pmc);
> +	pmc->perf_event = event;
> +	if (flag & SBI_PMU_CFG_FLAG_AUTO_START)
> +		perf_event_enable(pmc->perf_event);
> +
> +	return ctr_idx;
>  }
>  
>  int kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
>  {
>  	int i = 0, num_hw_ctrs, num_fw_ctrs, hpm_width;
>  	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> +	struct kvm_pmc *pmc;
>  
>  	if (!kvpmu)
>  		return -EINVAL;
> @@ -120,6 +392,7 @@ int kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
>  		return -EINVAL;
>  	}
>  
> +	bitmap_zero(kvpmu->used_pmc, RISCV_MAX_COUNTERS);
>  	kvpmu->num_hw_ctrs = num_hw_ctrs;
>  	kvpmu->num_fw_ctrs = num_fw_ctrs;
>  	/*
> @@ -132,38 +405,49 @@ int kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
>  		/* TIME CSR shouldn't be read from perf interface */
>  		if (i == 1)
>  			continue;
> -		kvpmu->pmc[i].idx = i;
> -		kvpmu->pmc[i].vcpu = vcpu;
> +		pmc = &kvpmu->pmc[i];

Better to introduce this with the patch that introduced this loop.

> +		pmc->idx = i;
> +		pmc->counter_val = 0;
> +		pmc->vcpu = vcpu;
>  		if (i < kvpmu->num_hw_ctrs) {
>  			kvpmu->pmc[i].cinfo.type = SBI_PMU_CTR_TYPE_HW;
>  			if (i < 3)
>  				/* CY, IR counters */
> -				kvpmu->pmc[i].cinfo.width = 63;
> +				pmc->cinfo.width = 63;
>  			else
> -				kvpmu->pmc[i].cinfo.width = hpm_width;
> +				pmc->cinfo.width = hpm_width;
>  			/*
>  			 * The CSR number doesn't have any relation with the logical
>  			 * hardware counters. The CSR numbers are encoded sequentially
>  			 * to avoid maintaining a map between the virtual counter
>  			 * and CSR number.
>  			 */
> -			kvpmu->pmc[i].cinfo.csr = CSR_CYCLE + i;
> +			pmc->cinfo.csr = CSR_CYCLE + i;
>  		} else {
> -			kvpmu->pmc[i].cinfo.type = SBI_PMU_CTR_TYPE_FW;
> -			kvpmu->pmc[i].cinfo.width = BITS_PER_LONG - 1;
> +			pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW;
> +			pmc->cinfo.width = BITS_PER_LONG - 1;
>  		}
>  	}
>  
>  	return 0;
>  }
>  
> -void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
> +void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
>  {
> -	/* TODO */
> +	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> +	struct kvm_pmc *pmc;
> +	int i;
> +
> +	if (!kvpmu)
> +		return;
> +
> +	for_each_set_bit(i, kvpmu->used_pmc, RISCV_MAX_COUNTERS) {
> +		pmc = &kvpmu->pmc[i];
> +		pmu_release_perf_event(pmc);
> +	}
>  }
>  
> -void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
> +void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
>  {
> -	/* TODO */
> +	kvm_riscv_vcpu_pmu_deinit(vcpu);
>  }
> -
> -- 
> 2.25.1
>

Thanks,
drew
Atish Patra Nov. 23, 2022, 12:45 a.m. UTC | #4
On Tue, Nov 1, 2022 at 8:31 AM Andrew Jones <ajones@ventanamicro.com> wrote:
>
> On Mon, Jul 18, 2022 at 10:02:04AM -0700, Atish Patra wrote:
> > RISC-V SBI PMU & Sscofpmf ISA extension allows supporting perf in
> > the virtualization enviornment as well. KVM implementation
> > relies on SBI PMU extension for most of the part while traps
>
> ...relies on the SBI PMU extension for the most part while trapping
> and emulating...
>
> > & emulates the CSRs read for counter access.
> >
> > Signed-off-by: Atish Patra <atishp@rivosinc.com>
> > ---
> >  arch/riscv/kvm/vcpu_pmu.c | 318 ++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 301 insertions(+), 17 deletions(-)
> >
> > diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c
> > index 5434051f495d..278c261efad3 100644
> > --- a/arch/riscv/kvm/vcpu_pmu.c
> > +++ b/arch/riscv/kvm/vcpu_pmu.c
> > @@ -11,9 +11,163 @@
> >  #include <linux/kvm_host.h>
> >  #include <linux/perf/riscv_pmu.h>
> >  #include <asm/csr.h>
> > +#include <asm/bitops.h>
> >  #include <asm/kvm_vcpu_pmu.h>
> >  #include <linux/kvm_host.h>
> >
> > +#define get_event_type(x) ((x & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16)
> > +#define get_event_code(x) (x & SBI_PMU_EVENT_IDX_CODE_MASK)
> > +
> > +static inline u64 pmu_get_sample_period(struct kvm_pmc *pmc)
> > +{
> > +     u64 counter_val_mask = GENMASK(pmc->cinfo.width, 0);
> > +     u64 sample_period;
> > +
> > +     if (!pmc->counter_val)
> > +             sample_period = counter_val_mask;
> > +     else
> > +             sample_period = pmc->counter_val & counter_val_mask;
> > +
> > +     return sample_period;
> > +}
> > +
> > +static u32 pmu_get_perf_event_type(unsigned long eidx)
> > +{
> > +     enum sbi_pmu_event_type etype = get_event_type(eidx);
> > +     u32 type;
> > +
> > +     if (etype == SBI_PMU_EVENT_TYPE_HW)
> > +             type = PERF_TYPE_HARDWARE;
> > +     else if (etype == SBI_PMU_EVENT_TYPE_CACHE)
> > +             type = PERF_TYPE_HW_CACHE;
> > +     else if (etype == SBI_PMU_EVENT_TYPE_RAW || etype == SBI_PMU_EVENT_TYPE_FW)
> > +             type = PERF_TYPE_RAW;
> > +     else
> > +             type = PERF_TYPE_MAX;
> > +
> > +     return type;
> > +}
> > +
> > +static inline bool pmu_is_fw_event(unsigned long eidx)
> > +{
> > +     enum sbi_pmu_event_type etype = get_event_type(eidx);
> > +
> > +     return (etype == SBI_PMU_EVENT_TYPE_FW) ? true : false;
>
>  return get_event_type(eidx) == SBI_PMU_EVENT_TYPE_FW;
>

Done.

> > +}
> > +
> > +static void pmu_release_perf_event(struct kvm_pmc *pmc)
> > +{
> > +     if (pmc->perf_event) {
> > +             perf_event_disable(pmc->perf_event);
> > +             perf_event_release_kernel(pmc->perf_event);
> > +             pmc->perf_event = NULL;
> > +     }
> > +}
> > +
> > +static u64 pmu_get_perf_event_hw_config(u32 sbi_event_code)
> > +{
> > +     /* SBI PMU HW event code is offset by 1 from perf hw event codes */
> > +     return (u64)sbi_event_code - 1;
> > +}
> > +
> > +static u64 pmu_get_perf_event_cache_config(u32 sbi_event_code)
> > +{
> > +     u64 config = U64_MAX;
> > +     unsigned int cache_type, cache_op, cache_result;
> > +
> > +     /* All the cache event masks lie within 0xFF. No separate masking is necesssary */
> > +     cache_type = (sbi_event_code & SBI_PMU_EVENT_CACHE_ID_CODE_MASK) >> 3;
> > +     cache_op = (sbi_event_code & SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK) >> 1;
> > +     cache_result = sbi_event_code & SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK;
> > +
> > +     if (cache_type >= PERF_COUNT_HW_CACHE_MAX ||
> > +         cache_op >= PERF_COUNT_HW_CACHE_OP_MAX ||
> > +         cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
> > +             goto out;
>
> No goto necessary
>

Done.

>         if (...)
>                 return U64_MAX;
>         return cache_type | (cache_op << 8) | (cache_result << 16);
>
> > +     config = cache_type | (cache_op << 8) | (cache_result << 16);
> > +out:
> > +     return config;
> > +}
> > +
> > +static u64 pmu_get_perf_event_config(unsigned long eidx, uint64_t edata)
> > +{
> > +     enum sbi_pmu_event_type etype = get_event_type(eidx);
> > +     u32 ecode = get_event_code(eidx);
> > +     u64 config = U64_MAX;
> > +
> > +     if (etype == SBI_PMU_EVENT_TYPE_HW)
> > +             config = pmu_get_perf_event_hw_config(ecode);
> > +     else if (etype == SBI_PMU_EVENT_TYPE_CACHE)
> > +             config = pmu_get_perf_event_cache_config(ecode);
> > +     else if (etype == SBI_PMU_EVENT_TYPE_RAW)
> > +             config = edata & RISCV_PMU_RAW_EVENT_MASK;
> > +     else if ((etype == SBI_PMU_EVENT_TYPE_FW) && (ecode < SBI_PMU_FW_MAX))
> > +             config = (1ULL << 63) | ecode;
> > +
> > +     return config;
> > +}
> > +
> > +static int pmu_get_fixed_pmc_index(unsigned long eidx)
> > +{
> > +     u32 etype = pmu_get_perf_event_type(eidx);
> > +     u32 ecode = get_event_code(eidx);
> > +     int ctr_idx;
> > +
> > +     if (etype != SBI_PMU_EVENT_TYPE_HW)
> > +             return -EINVAL;
> > +
> > +     if (ecode == SBI_PMU_HW_CPU_CYCLES)
> > +             ctr_idx = 0;
> > +     else if (ecode == SBI_PMU_HW_INSTRUCTIONS)
> > +             ctr_idx = 2;
> > +     else
> > +             return -EINVAL;
> > +
> > +     return ctr_idx;
> > +}
> > +
> > +static int pmu_get_programmable_pmc_index(struct kvm_pmu *kvpmu, unsigned long eidx,
> > +                                       unsigned long cbase, unsigned long cmask)
> > +{
> > +     int ctr_idx = -1;
> > +     int i, pmc_idx;
> > +     int min, max;
> > +
> > +     if (pmu_is_fw_event(eidx)) {
> > +             /* Firmware counters are mapped 1:1 starting from num_hw_ctrs for simplicity */
> > +             min = kvpmu->num_hw_ctrs;
> > +             max = min + kvpmu->num_fw_ctrs;
> > +     } else {
> > +             /* First 3 counters are reserved for fixed counters */
> > +             min = 3;
> > +             max = kvpmu->num_hw_ctrs;
> > +     }
> > +
> > +     for_each_set_bit(i, &cmask, BITS_PER_LONG) {
> > +             pmc_idx = i + cbase;
> > +             if ((pmc_idx >= min && pmc_idx < max) &&
> > +                 !test_bit(pmc_idx, kvpmu->used_pmc)) {
> > +                     ctr_idx = pmc_idx;
> > +                     break;
> > +             }
> > +     }
> > +
> > +     return ctr_idx;
> > +}
> > +
> > +static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx,
> > +                          unsigned long cbase, unsigned long cmask)
> > +{
> > +     int ret;
> > +
> > +     /* Fixed counters need to be have fixed mapping as they have different width */
> > +     ret = pmu_get_fixed_pmc_index(eidx);
> > +     if (ret >= 0)
> > +             return ret;
> > +
> > +     return pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask);
> > +}
> > +
> >  int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
> >                               unsigned long *out_val)
> >  {
> > @@ -43,7 +197,6 @@ int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num,
> >
> >       if (!kvpmu)
> >               return KVM_INSN_EXIT_TO_USER_SPACE;
> > -     //TODO: Should we check if vcpu pmu is initialized or not!
> >       if (wr_mask)
> >               return KVM_INSN_ILLEGAL_TRAP;
> >       cidx = csr_num - CSR_CYCLE;
> > @@ -81,14 +234,62 @@ int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx,
> >  int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base,
> >                                unsigned long ctr_mask, unsigned long flag, uint64_t ival)
> >  {
> > -     /* TODO */
> > +     struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> > +     int i, num_ctrs, pmc_index;
> > +     struct kvm_pmc *pmc;
> > +
> > +     num_ctrs = kvpmu->num_fw_ctrs + kvpmu->num_hw_ctrs;
> > +     if (ctr_base + __fls(ctr_mask) >= num_ctrs)
> > +             return -EINVAL;
> > +
> > +     /* Start the counters that have been configured and requested by the guest */
> > +     for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
> > +             pmc_index = i + ctr_base;
> > +             if (!test_bit(pmc_index, kvpmu->used_pmc))
> > +                     continue;
> > +             pmc = &kvpmu->pmc[pmc_index];
> > +             if (flag & SBI_PMU_START_FLAG_SET_INIT_VALUE)
> > +                     pmc->counter_val = ival;
> > +             if (pmc->perf_event) {
> > +                     perf_event_period(pmc->perf_event, pmu_get_sample_period(pmc));
> > +                     perf_event_enable(pmc->perf_event);
> > +             }
>
> What about checking the "SBI_ERR_ALREADY_STARTED" condition?
>

The pmu start and stop callbacks in the perf core framework return void,
so perf core cannot report an error code from pmu start/stop and we
cannot detect SBI_ERR_ALREADY_STARTED/STOPPED that way. Perf does
maintain a state, PERF_EVENT_STATE_ACTIVE (once enabled) /
PERF_EVENT_STATE_OFF (disabled). We can check those states, but they can
be inaccurate for other reasons. A debug message can be printed if
enabling/disabling fails, though.

However, the KVM PMU implementation should return
SBI_ERR_ALREADY_STARTED/STOPPED if the event is not in the expected
state before enabling/disabling it. I have added that.
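
The check I have in mind is roughly the below (just a sketch -- it
assumes SBI_ERR_ALREADY_STARTED/STOPPED definitions are visible to KVM
and that peeking at the perf event state is acceptable; the helper name
is illustrative):

static bool pmu_ctr_is_active(struct kvm_pmc *pmc)
{
        return pmc->perf_event &&
               pmc->perf_event->state == PERF_EVENT_STATE_ACTIVE;
}

        /* in kvm_riscv_vcpu_pmu_ctr_start(), before perf_event_enable() */
        if (pmu_ctr_is_active(pmc))
                return SBI_ERR_ALREADY_STARTED;

        /* in kvm_riscv_vcpu_pmu_ctr_stop(), before perf_event_disable() */
        if (!pmu_ctr_is_active(pmc))
                return SBI_ERR_ALREADY_STOPPED;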

This brings up another generic error returning problem in KVM SBI
land. Usually, SBI error code numbers do not
align with Linux error codes to accommodate other operating systems.
However, most of the SBI error codes
have 1-1 relationship with the Linux error code.
Thus, kvm internal code returns a Linux specific error code and
vcpu_sbi will map those to SBI error code using
kvm_linux_err_map_sbi.

However, this will not work for SBI_ERR_ALREADY_STARTED/STOPPED as
there are no corresponding
Linux specific error codes. We can directly return the SBI error codes
from vcpu_pmu.c and modify the
kvm_linux_err_map_sbi to pass through those. In that case, we can't
map any linux error code that
collides with SBI error code. Any other ideas to handle this case ?
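
To spell out the collision: SBI error codes and negated Linux errnos
occupy the same small negative range (the values below come from the SBI
spec and errno.h; the pass-through helper itself is purely
hypothetical), so a naive pass-through cannot be disambiguated:

/*
 *      SBI_ERR_FAILURE         -1   == -EPERM
 *      SBI_ERR_NOT_SUPPORTED   -2   == -ENOENT
 *      SBI_ERR_INVALID_PARAM   -3   == -ESRCH
 *
 * A pass-through cannot tell whether a handler meant an SBI error or a
 * plain Linux errno:
 */
static unsigned long kvm_sbi_err_passthrough(long err)
{
        /* assumption: err already carries an SBI error code -- unverifiable */
        return (unsigned long)err;
}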


> > +     }
> > +
> >       return 0;
> >  }
> >
> >  int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base,
> >                                unsigned long ctr_mask, unsigned long flag)
> >  {
> > -     /* TODO */
> > +     struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> > +     int i, num_ctrs, pmc_index;
> > +     u64 enabled, running;
> > +     struct kvm_pmc *pmc;
> > +
> > +     num_ctrs = kvpmu->num_fw_ctrs + kvpmu->num_hw_ctrs;
> > +     if ((ctr_base + __fls(ctr_mask)) >= num_ctrs)
> > +             return -EINVAL;
> > +
> > +     /* Stop the counters that have been configured and requested by the guest */
> > +     for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
> > +             pmc_index = i + ctr_base;
> > +             if (!test_bit(pmc_index, kvpmu->used_pmc))
> > +                     continue;
> > +             pmc = &kvpmu->pmc[pmc_index];
> > +             if (pmc->perf_event) {
> > +                     /* Stop counting the counter */
> > +                     perf_event_disable(pmc->perf_event);
> > +                     if (flag & SBI_PMU_STOP_FLAG_RESET) {
> > +                             /* Relase the counter if this is a reset request */
> > +                             pmc->counter_val += perf_event_read_value(pmc->perf_event,
> > +                                                                       &enabled, &running);
> > +                             pmu_release_perf_event(pmc);
> > +                             clear_bit(pmc_index, kvpmu->used_pmc);
> > +                     }
>
> What about checking the "SBI_ERR_ALREADY_STOPPED" condition?
>

> > +             }
> > +     }
> > +
> >       return 0;
> >  }
> >
> > @@ -96,14 +297,85 @@ int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_ba
> >                                    unsigned long ctr_mask, unsigned long flag,
> >                                    unsigned long eidx, uint64_t edata)
> >  {
> > -     /* TODO */
> > -     return 0;
> > +     struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> > +     struct perf_event *event;
> > +     struct perf_event_attr attr;
> > +     int num_ctrs, ctr_idx;
> > +     u32 etype = pmu_get_perf_event_type(eidx);
> > +     u64 config;
> > +     struct kvm_pmc *pmc;
> > +
> > +     num_ctrs = kvpmu->num_fw_ctrs + kvpmu->num_hw_ctrs;
> > +     if ((etype == PERF_TYPE_MAX) || ((ctr_base + __fls(ctr_mask)) >= num_ctrs))
>
> nit: Unnecessary ()
>

Fixed.

> > +             return -EINVAL;
> > +
> > +     if (pmu_is_fw_event(eidx))
> > +             return -EOPNOTSUPP;
>
> nit: need blank line here
>

Fixed.

> > +     /*
> > +      * SKIP_MATCH flag indicates the caller is aware of the assigned counter
> > +      * for this event. Just do a sanity check if it already marked used.
> > +      */
> > +     if (flag & SBI_PMU_CFG_FLAG_SKIP_MATCH) {
> > +             if (!test_bit(ctr_base, kvpmu->used_pmc))
> > +                     return -EINVAL;
> > +             ctr_idx = ctr_base;
> > +             goto match_done;
> > +     }
> > +
> > +     ctr_idx = pmu_get_pmc_index(kvpmu, eidx, ctr_base, ctr_mask);
> > +     if (ctr_idx < 0)
> > +             return -EOPNOTSUPP;
> > +
> > +match_done:
> > +     pmc = &kvpmu->pmc[ctr_idx];
> > +     pmu_release_perf_event(pmc);
> > +     pmc->idx = ctr_idx;
> > +
> > +     config = pmu_get_perf_event_config(eidx, edata);
> > +     memset(&attr, 0, sizeof(struct perf_event_attr));
> > +     attr.type = etype;
> > +     attr.size = sizeof(attr);
> > +     attr.pinned = true;
> > +
> > +     /*
> > +      * It should never reach here if the platform doesn't support sscofpmf extensio
> > +      * as mode filtering won't work without it.
> > +      */
> > +     attr.exclude_host = true;
> > +     attr.exclude_hv = true;
> > +     attr.exclude_user = flag & SBI_PMU_CFG_FLAG_SET_UINH ? 1 : 0;
> > +     attr.exclude_kernel = flag & SBI_PMU_CFG_FLAG_SET_SINH ? 1 : 0;
>
> nit: can use !!(flag & SBI_PMU_CFG_FLAG_SET_UINH)

Fixed.

>
> > +     attr.config = config;
> > +     attr.config1 = RISCV_KVM_PMU_CONFIG1_GUEST_EVENTS;
> > +     if (flag & SBI_PMU_CFG_FLAG_CLEAR_VALUE) {
> > +             //TODO: Do we really want to clear the value in hardware counter
> > +             pmc->counter_val = 0;
> > +     }
>
> nit: add blank line

Fixed.

>
> > +     /*
> > +      * Set the default sample_period for now. The guest specified value
> > +      * will be updated in the start call.
> > +      */
> > +     attr.sample_period = pmu_get_sample_period(pmc);
> > +
> > +     event = perf_event_create_kernel_counter(&attr, -1, current, NULL, pmc);
> > +     if (IS_ERR(event)) {
> > +             pr_err("kvm pmu event creation failed event %pe for eidx %lx\n", event, eidx);
> > +             return -EOPNOTSUPP;
> > +     }
> > +
> > +     set_bit(ctr_idx, kvpmu->used_pmc);
> > +     pmc->perf_event = event;
> > +     if (flag & SBI_PMU_CFG_FLAG_AUTO_START)
> > +             perf_event_enable(pmc->perf_event);
> > +
> > +     return ctr_idx;
> >  }
> >
> >  int kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
> >  {
> >       int i = 0, num_hw_ctrs, num_fw_ctrs, hpm_width;
> >       struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> > +     struct kvm_pmc *pmc;
> >
> >       if (!kvpmu)
> >               return -EINVAL;
> > @@ -120,6 +392,7 @@ int kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
> >               return -EINVAL;
> >       }
> >
> > +     bitmap_zero(kvpmu->used_pmc, RISCV_MAX_COUNTERS);
> >       kvpmu->num_hw_ctrs = num_hw_ctrs;
> >       kvpmu->num_fw_ctrs = num_fw_ctrs;
> >       /*
> > @@ -132,38 +405,49 @@ int kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
> >               /* TIME CSR shouldn't be read from perf interface */
> >               if (i == 1)
> >                       continue;
> > -             kvpmu->pmc[i].idx = i;
> > -             kvpmu->pmc[i].vcpu = vcpu;
> > +             pmc = &kvpmu->pmc[i];
>
> Better to introduce this with the patch that introduced this loop.
>

Sure. Done.

> > +             pmc->idx = i;
> > +             pmc->counter_val = 0;
> > +             pmc->vcpu = vcpu;
> >               if (i < kvpmu->num_hw_ctrs) {
> >                       kvpmu->pmc[i].cinfo.type = SBI_PMU_CTR_TYPE_HW;
> >                       if (i < 3)
> >                               /* CY, IR counters */
> > -                             kvpmu->pmc[i].cinfo.width = 63;
> > +                             pmc->cinfo.width = 63;
> >                       else
> > -                             kvpmu->pmc[i].cinfo.width = hpm_width;
> > +                             pmc->cinfo.width = hpm_width;
> >                       /*
> >                        * The CSR number doesn't have any relation with the logical
> >                        * hardware counters. The CSR numbers are encoded sequentially
> >                        * to avoid maintaining a map between the virtual counter
> >                        * and CSR number.
> >                        */
> > -                     kvpmu->pmc[i].cinfo.csr = CSR_CYCLE + i;
> > +                     pmc->cinfo.csr = CSR_CYCLE + i;
> >               } else {
> > -                     kvpmu->pmc[i].cinfo.type = SBI_PMU_CTR_TYPE_FW;
> > -                     kvpmu->pmc[i].cinfo.width = BITS_PER_LONG - 1;
> > +                     pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW;
> > +                     pmc->cinfo.width = BITS_PER_LONG - 1;
> >               }
> >       }
> >
> >       return 0;
> >  }
> >
> > -void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
> > +void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
> >  {
> > -     /* TODO */
> > +     struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
> > +     struct kvm_pmc *pmc;
> > +     int i;
> > +
> > +     if (!kvpmu)
> > +             return;
> > +
> > +     for_each_set_bit(i, kvpmu->used_pmc, RISCV_MAX_COUNTERS) {
> > +             pmc = &kvpmu->pmc[i];
> > +             pmu_release_perf_event(pmc);
> > +     }
> >  }
> >
> > -void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
> > +void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
> >  {
> > -     /* TODO */
> > +     kvm_riscv_vcpu_pmu_deinit(vcpu);
> >  }
> > -
> > --
> > 2.25.1
> >
>
> Thanks,
> drew
Andrew Jones Nov. 23, 2022, 2:22 p.m. UTC | #5
On Tue, Nov 22, 2022 at 04:45:16PM -0800, Atish Patra wrote:
...
> This brings up another generic error returning problem in KVM SBI
> land. Usually, SBI error code numbers do not
> align with Linux error codes to accommodate other operating systems.
> However, most of the SBI error codes
> have 1-1 relationship with the Linux error code.
> Thus, kvm internal code returns a Linux specific error code and
> vcpu_sbi will map those to SBI error code using
> kvm_linux_err_map_sbi.
> 
> However, this will not work for SBI_ERR_ALREADY_STARTED/STOPPED as
> there are no corresponding
> Linux specific error codes. We can directly return the SBI error codes
> from vcpu_pmu.c and modify the
> kvm_linux_err_map_sbi to pass through those. In that case, we can't
> map any linux error code that
> collides with SBI error code. Any other ideas to handle this case ?
>

It seems like we should drop kvm_linux_err_map_sbi() and add another
parameter to kvm_vcpu_sbi_extension.handler for the SBI error. Another
option is to continue mapping SBI errors to Linux errors, e.g.
SBI_ERR_ALREADY_STARTED == EBUSY, but that may not be too easy in
all cases and the errors become ambiguous, as we can't tell if the
Linux implementation generated the error or if the SBI call did.
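
For the mapping option, the change itself would only be a couple of
extra cases in the errno -> SBI translation (illustrative only, assuming
the current switch-based kvm_linux_err_map_sbi()), but the STOPPED half
is the sticking point:

        case -EBUSY:
                return SBI_ERR_ALREADY_STARTED;
        /* ...and there is no obvious errno to stand in for ALREADY_STOPPED */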

Thanks,
drew
Atish Patra Dec. 2, 2022, 9:08 a.m. UTC | #6
On Wed, Nov 23, 2022 at 6:22 AM Andrew Jones <ajones@ventanamicro.com> wrote:
>
> On Tue, Nov 22, 2022 at 04:45:16PM -0800, Atish Patra wrote:
> ...
> > This brings up another generic error returning problem in KVM SBI
> > land. Usually, SBI error code numbers do not
> > align with Linux error codes to accommodate other operating systems.
> > However, most of the SBI error codes
> > have 1-1 relationship with the Linux error code.
> > Thus, kvm internal code returns a Linux specific error code and
> > vcpu_sbi will map those to SBI error code using
> > kvm_linux_err_map_sbi.
> >
> > However, this will not work for SBI_ERR_ALREADY_STARTED/STOPPED as
> > there are no corresponding
> > Linux specific error codes. We can directly return the SBI error codes
> > from vcpu_pmu.c and modify the
> > kvm_linux_err_map_sbi to pass through those. In that case, we can't
> > map any linux error code that
> > collides with SBI error code. Any other ideas to handle this case ?
> >
>
> It seems like we should drop kvm_linux_err_map_sbi() and add another
> parameter to kvm_vcpu_sbi_extension.handler for the SBI error. Another

That will just move the problem from the generic SBI layer to
extension specific layer.
The root problem remains the same as we can't expect the individual
extension to return
a valid linux specific error code.

Maybe we can relax that requirement. Thus, any extension that has
additional SBI error codes may opt to return SBI error codes directly.
For example, the PMU extension implementation will directly return
SBI-specific error codes from arch/riscv/kvm/vcpu_pmu.c. In the future,
other extensions (e.g. TEE) will have many more error codes that can
leverage this as well.

Does that sound reasonable ?

> option is to continue mapping SBI errors to Linux errors, e.g.
> SBI_ERR_ALREADY_STARTED == EBUSY, but that may not be too easy in
> all cases and the errors become ambiguous, as we can't tell if the
> Linux implementation generated the error or if the SBI call did.
>

We can't distinguish between SBI_ERR_ALREADY_STARTED/STOPPED in that case.

> Thanks,
> drew
Andrew Jones Dec. 2, 2022, 11:37 a.m. UTC | #7
On Fri, Dec 02, 2022 at 01:08:47AM -0800, Atish Patra wrote:
> On Wed, Nov 23, 2022 at 6:22 AM Andrew Jones <ajones@ventanamicro.com> wrote:
> >
> > On Tue, Nov 22, 2022 at 04:45:16PM -0800, Atish Patra wrote:
> > ...
> > > This brings up another generic error returning problem in KVM SBI
> > > land. Usually, SBI error code numbers do not
> > > align with Linux error codes to accommodate other operating systems.
> > > However, most of the SBI error codes
> > > have 1-1 relationship with the Linux error code.
> > > Thus, kvm internal code returns a Linux specific error code and
> > > vcpu_sbi will map those to SBI error code using
> > > kvm_linux_err_map_sbi.
> > >
> > > However, this will not work for SBI_ERR_ALREADY_STARTED/STOPPED as
> > > there are no corresponding
> > > Linux specific error codes. We can directly return the SBI error codes
> > > from vcpu_pmu.c and modify the
> > > kvm_linux_err_map_sbi to pass through those. In that case, we can't
> > > map any linux error code that
> > > collides with SBI error code. Any other ideas to handle this case ?
> > >
> >
> > It seems like we should drop kvm_linux_err_map_sbi() and add another
> > parameter to kvm_vcpu_sbi_extension.handler for the SBI error. Another
> 
> That will just move the problem from the generic SBI layer to
> extension specific layer.
> The root problem remains the same as we can't expect the individual
> extension to return
> a valid linux specific error code.

I'm saying we return both from the extension specific layer, particularly
because only the extension specific layer knows what it should return.
KVM's SBI handlers currently have a return value and *out_val. *out_val
maps directly to SBI's sbiret.value, but the return value does not map to
SBI's sbiret.error. But, all we have to do is add *error_val to the
parameters for the extension handler to get it. Then, cp->a0 should be set
to that, not the return value.
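
Concretely, something like the below (sketching against the current
kvm_vcpu_sbi_extension definition from memory; the parameter name and
type are of course up for debate):

struct kvm_vcpu_sbi_extension {
        unsigned long extid_start;
        unsigned long extid_end;
        int (*handler)(struct kvm_vcpu *vcpu, struct kvm_run *run,
                       unsigned long *out_val, unsigned long *error_val,
                       struct kvm_cpu_trap *utrap, bool *exit);
};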

> 
> Maybe we can relax that requirement. Thus, any extension that has
> additional SBI error codes may opt to return SBI error codes directly.
> For example, the PMU extension implementation will directly return
> SBI-specific error codes from arch/riscv/kvm/vcpu_pmu.c. In the future,
> other extensions (e.g. TEE) will have many more error codes that can
> leverage this as well.
> 
> Does that sound reasonable ?

I think we need both the Linux return and sbiret.error. The return value
indicates a problem *with* the emulation, while the new parameter I'm
proposing (*error_val) is the return value *of* the emulation. Normally
the Linux return value will be zero (a successful Linux call) even when
emulating a failure (*error_val != SBI_SUCCESS). When the return value
is not zero, then there's something wrong in KVM and the return value
should be propagated to userspace. We could also set the exit_reason to
KVM_EXIT_INTERNAL_ERROR, but KVM_EXIT_UNKNOWN is probably fine too.
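
In other words, the call site in kvm_riscv_vcpu_sbi_ecall() would end up
looking roughly like this (sketch only, ignoring the v0.1 extensions and
the exact "continue running the vcpu" return plumbing):

        ret = sbi_ext->handler(vcpu, run, &out_val, &error_val, &utrap,
                               &userspace_exit);
        if (ret < 0) {
                /* something went wrong *with* the emulation -> userspace */
                run->exit_reason = KVM_EXIT_UNKNOWN;
                return ret;
        }

        /* the result *of* the emulation -> sbiret.error/.value for the guest */
        cp->a0 = error_val;
        cp->a1 = out_val;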

> 
> > option is to continue mapping SBI errors to Linux errors, e.g.
> > SBI_ERR_ALREADY_STARTED == EBUSY, but that may not be too easy in
> > all cases and the errors become ambiguous, as we can't tell if the
> > Linux implementation generated the error or if the SBI call did.
> >
> 
> We can't distinguish between SBI_ERR_ALREADY_STARTED/STOPPED in that case.

That's why I only suggested using EBUSY for STARTED. Mapping STOPPED
was left as an exercise for the reader :-)

Thanks,
drew
Sean Christopherson Dec. 2, 2022, 5:09 p.m. UTC | #8
On Mon, Jul 18, 2022, Atish Patra wrote:
> RISC-V SBI PMU & Sscofpmf ISA extension allows supporting perf in
> the virtualization enviornment as well. KVM implementation
> relies on SBI PMU extension for most of the part while traps
> & emulates the CSRs read for counter access.

For the benefit of non-RISCV people, the changelog (and documentation?) should
explain why RISC-V doesn't need to tap into kvm_register_perf_callbacks().
Presumably there's something in the "RISC-V SBI PMU & Sscofpmf ISA extension" spec
that allows hardware to differentiate between events that are for guest vs. host?
Atish Patra Dec. 7, 2022, 8:06 a.m. UTC | #9
On Fri, Dec 2, 2022 at 9:09 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Mon, Jul 18, 2022, Atish Patra wrote:
> > RISC-V SBI PMU & Sscofpmf ISA extension allows supporting perf in
> > the virtualization enviornment as well. KVM implementation
> > relies on SBI PMU extension for most of the part while traps
> > & emulates the CSRs read for counter access.
>
> For the benefit of non-RISCV people, the changelog (and documentation?) should
> explain why RISC-V doesn't need to tap into kvm_register_perf_callbacks().

As per my understanding, kvm_register_perf_callbacks is only useful
during event sampling for guests. Please let me know if I missed
something.
This series doesn't support sampling and guest counter overflow
interrupt yet.  That's why kvm_register_perf_callbacks support is
missing.
I will add kvm_register_perf_callbacks in the next revision along with
interrupt support.

> Presumably there's something in the "RISC-V SBI PMU & Sscofpmf ISA extension" spec
> that allows hardware to differentiate between events that are for guest vs. host?

Not directly. The Sscofpmf extension does have privilege-mode-specific
filtering bits[1], i.e. VSINH and VUINH, which indicate that the events
are specific to the guest. But that is only true if the hypervisor has
enabled those bits. As I said above, RISC-V does need to tap into
kvm_register_perf_callbacks as well.

[1] https://drive.google.com/file/d/171j4jFjIkKdj5LWcExphq4xG_2sihbfd/edit
Atish Patra Dec. 7, 2022, 8:49 a.m. UTC | #10
On Fri, Dec 2, 2022 at 3:37 AM Andrew Jones <ajones@ventanamicro.com> wrote:
>
> On Fri, Dec 02, 2022 at 01:08:47AM -0800, Atish Patra wrote:
> > On Wed, Nov 23, 2022 at 6:22 AM Andrew Jones <ajones@ventanamicro.com> wrote:
> > >
> > > On Tue, Nov 22, 2022 at 04:45:16PM -0800, Atish Patra wrote:
> > > ...
> > > > This brings up another generic error returning problem in KVM SBI
> > > > land. Usually, SBI error code numbers do not
> > > > align with Linux error codes to accommodate other operating systems.
> > > > However, most of the SBI error codes
> > > > have 1-1 relationship with the Linux error code.
> > > > Thus, kvm internal code returns a Linux specific error code and
> > > > vcpu_sbi will map those to SBI error code using
> > > > kvm_linux_err_map_sbi.
> > > >
> > > > However, this will not work for SBI_ERR_ALREADY_STARTED/STOPPED as
> > > > there are no corresponding
> > > > Linux specific error codes. We can directly return the SBI error codes
> > > > from vcpu_pmu.c and modify the
> > > > kvm_linux_err_map_sbi to pass through those. In that case, we can't
> > > > map any linux error code that
> > > > collides with SBI error code. Any other ideas to handle this case ?
> > > >
> > >
> > > It seems like we should drop kvm_linux_err_map_sbi() and add another
> > > parameter to kvm_vcpu_sbi_extension.handler for the SBI error. Another
> >
> > That will just move the problem from the generic SBI layer to
> > extension specific layer.
> > The root problem remains the same as we can't expect the individual
> > extension to return
> > a valid linux specific error code.
>
> I'm saying we return both from the extension specific layer, particularly
> because only the extension specific layer knows what it should return.
> KVM's SBI handlers currently have a return value and *out_val. *out_val
> maps directly to SBI's sbiret.value, but the return value does not map to
> SBI's sbiret.error. But, all we have to do is add *error_val to the
> parameters for the extension handler to get it. Then, cp->a0 should be set
> to that, not the return value.
>

Ahh. Yes. That will solve this issue.

> >
> > Maybe we can relax that requirement. Thus, any extension that has
> > additional SBI error codes may opt to return SBI error codes directly.
> > For example, the PMU extension implementation will directly return
> > SBI-specific error codes from arch/riscv/kvm/vcpu_pmu.c. In the future,
> > other extensions (e.g. TEE) will have many more error codes that can
> > leverage this as well.
> >
> > Does that sound reasonable ?
>
> I think we need both the Linux return and sbiret.error. The return value
> indicates a problem *with* the emulation, while the new parameter I'm
> proposing (*error_val) is the return value *of* the emulation. Normally
> the Linux return value will be zero (a successful Linux call) even when
> emulating a failure (*error_val != SBI_SUCCESS). When the return value
> is not zero, then there's something wrong in KVM and the return value
> should be propagated to userspace. We could also set the exit_reason to
> KVM_EXIT_INTERNAL_ERROR, but KVM_EXIT_UNKNOWN is probably fine too.
>

Agreed. I will revise the series accordingly. Thanks!

> >
> > > option is to continue mapping SBI errors to Linux errors, e.g.
> > > SBI_ERR_ALREADY_STARTED == EBUSY, but that may not be too easy in
> > > all cases and the errors become ambiguous, as we can't tell if the
> > > Linux implementation generated the error or if the SBI call did.
> > >
> >
> > We can't distinguish between SBI_ERR_ALREADY_STARTED/STOPPED in that case.
>
> That's why I only suggested using EBUSY for STARTED. Mapping STOPPED
> was left as an exercise for the reader :-)
>
> Thanks,
> drew
Sean Christopherson Dec. 7, 2022, 4:31 p.m. UTC | #11
On Wed, Dec 07, 2022, Atish Patra wrote:
> On Fri, Dec 2, 2022 at 9:09 AM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Mon, Jul 18, 2022, Atish Patra wrote:
> > > RISC-V SBI PMU & Sscofpmf ISA extension allows supporting perf in
> > > the virtualization enviornment as well. KVM implementation
> > > relies on SBI PMU extension for most of the part while traps
> > > & emulates the CSRs read for counter access.
> >
> > For the benefit of non-RISCV people, the changelog (and documentation?) should
> > explain why RISC-V doesn't need to tap into kvm_register_perf_callbacks().
> 
> As per my understanding, kvm_register_perf_callbacks is only useful
> during event sampling for guests. Please let me know if I missed
> something.
> This series doesn't support sampling and guest counter overflow interrupt yet.
> That's why kvm_register_perf_callbacks support is missing.

Ah, I missed that connection in the cover letter.

In the future, if a patch adds partial support for a thing/feature, it's very
helpful to call that out in the shortlog and changelog, even for RFCs.  E.g.
adding a single word in the shortlog and a sentence or two in the changelog
doesn't take much time on your end, and helps keep drive-by reviewers like me
from causing a fuss about non-issues.

 RISC-V: KVM: Implement partial perf support

 ...

 Counter overflow and interrupts are not supported as the relevant
 architectural specifications are still under discussion.

Thanks!
Atish Patra Dec. 8, 2022, 1:11 a.m. UTC | #12
On Wed, Dec 7, 2022 at 8:31 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Wed, Dec 07, 2022, Atish Patra wrote:
> > On Fri, Dec 2, 2022 at 9:09 AM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Mon, Jul 18, 2022, Atish Patra wrote:
> > > > RISC-V SBI PMU & Sscofpmf ISA extension allows supporting perf in
> > > > the virtualization enviornment as well. KVM implementation
> > > > relies on SBI PMU extension for most of the part while traps
> > > > & emulates the CSRs read for counter access.
> > >
> > > For the benefit of non-RISCV people, the changelog (and documentation?) should
> > > explain why RISC-V doesn't need to tap into kvm_register_perf_callbacks().
> >
> > As per my understanding, kvm_register_perf_callbacks is only useful
> > during event sampling for guests. Please let me know if I missed
> > something.
> > This series doesn't support sampling and guest counter overflow interrupt yet.
> > That's why kvm_register_perf_callbacks support is missing.
>
> Ah, I missed that connection in the cover letter.
>
> In the future, if a patch adds partial support for a thing/feature, it's very
> helpful to call that out in the shortlog and changelog, even for RFCs.  E.g.
> adding a single word in the shortlog and a sentence or two in the changelog
> doesn't take much time on your end, and helps keep drive-by reviewers like me
> from causing a fuss about non-issues.
>

Absolutely. I will update the commit text in v2 accordingly.

>  RISC-V: KVM: Implement partial perf support
>
>  ...
>
>  Counter overflow and interrupts are not supported as the relevant
>  architectural specifications are still under discussion.
>
> Thanks!



--
Regards,
Atish
diff mbox series

Patch

diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c
index 5434051f495d..278c261efad3 100644
--- a/arch/riscv/kvm/vcpu_pmu.c
+++ b/arch/riscv/kvm/vcpu_pmu.c
@@ -11,9 +11,163 @@ 
 #include <linux/kvm_host.h>
 #include <linux/perf/riscv_pmu.h>
 #include <asm/csr.h>
+#include <asm/bitops.h>
 #include <asm/kvm_vcpu_pmu.h>
 #include <linux/kvm_host.h>
 
+#define get_event_type(x) ((x & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16)
+#define get_event_code(x) (x & SBI_PMU_EVENT_IDX_CODE_MASK)
+
+static inline u64 pmu_get_sample_period(struct kvm_pmc *pmc)
+{
+	u64 counter_val_mask = GENMASK(pmc->cinfo.width, 0);
+	u64 sample_period;
+
+	if (!pmc->counter_val)
+		sample_period = counter_val_mask;
+	else
+		sample_period = pmc->counter_val & counter_val_mask;
+
+	return sample_period;
+}
+
+static u32 pmu_get_perf_event_type(unsigned long eidx)
+{
+	enum sbi_pmu_event_type etype = get_event_type(eidx);
+	u32 type;
+
+	if (etype == SBI_PMU_EVENT_TYPE_HW)
+		type = PERF_TYPE_HARDWARE;
+	else if (etype == SBI_PMU_EVENT_TYPE_CACHE)
+		type = PERF_TYPE_HW_CACHE;
+	else if (etype == SBI_PMU_EVENT_TYPE_RAW || etype == SBI_PMU_EVENT_TYPE_FW)
+		type = PERF_TYPE_RAW;
+	else
+		type = PERF_TYPE_MAX;
+
+	return type;
+}
+
+static inline bool pmu_is_fw_event(unsigned long eidx)
+{
+	enum sbi_pmu_event_type etype = get_event_type(eidx);
+
+	return (etype == SBI_PMU_EVENT_TYPE_FW) ? true : false;
+}
+
+static void pmu_release_perf_event(struct kvm_pmc *pmc)
+{
+	if (pmc->perf_event) {
+		perf_event_disable(pmc->perf_event);
+		perf_event_release_kernel(pmc->perf_event);
+		pmc->perf_event = NULL;
+	}
+}
+
+static u64 pmu_get_perf_event_hw_config(u32 sbi_event_code)
+{
+	/* SBI PMU HW event code is offset by 1 from perf hw event codes */
+	return (u64)sbi_event_code - 1;
+}
+
+static u64 pmu_get_perf_event_cache_config(u32 sbi_event_code)
+{
+	u64 config = U64_MAX;
+	unsigned int cache_type, cache_op, cache_result;
+
+	/* All the cache event masks lie within 0xFF. No separate masking is necesssary */
+	cache_type = (sbi_event_code & SBI_PMU_EVENT_CACHE_ID_CODE_MASK) >> 3;
+	cache_op = (sbi_event_code & SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK) >> 1;
+	cache_result = sbi_event_code & SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK;
+
+	if (cache_type >= PERF_COUNT_HW_CACHE_MAX ||
+	    cache_op >= PERF_COUNT_HW_CACHE_OP_MAX ||
+	    cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		goto out;
+	config = cache_type | (cache_op << 8) | (cache_result << 16);
+out:
+	return config;
+}
+
+static u64 pmu_get_perf_event_config(unsigned long eidx, uint64_t edata)
+{
+	enum sbi_pmu_event_type etype = get_event_type(eidx);
+	u32 ecode = get_event_code(eidx);
+	u64 config = U64_MAX;
+
+	if (etype == SBI_PMU_EVENT_TYPE_HW)
+		config = pmu_get_perf_event_hw_config(ecode);
+	else if (etype == SBI_PMU_EVENT_TYPE_CACHE)
+		config = pmu_get_perf_event_cache_config(ecode);
+	else if (etype == SBI_PMU_EVENT_TYPE_RAW)
+		config = edata & RISCV_PMU_RAW_EVENT_MASK;
+	else if ((etype == SBI_PMU_EVENT_TYPE_FW) && (ecode < SBI_PMU_FW_MAX))
+		config = (1ULL << 63) | ecode;
+
+	return config;
+}
+
+static int pmu_get_fixed_pmc_index(unsigned long eidx)
+{
+	u32 etype = pmu_get_perf_event_type(eidx);
+	u32 ecode = get_event_code(eidx);
+	int ctr_idx;
+
+	if (etype != SBI_PMU_EVENT_TYPE_HW)
+		return -EINVAL;
+
+	if (ecode == SBI_PMU_HW_CPU_CYCLES)
+		ctr_idx = 0;
+	else if (ecode == SBI_PMU_HW_INSTRUCTIONS)
+		ctr_idx = 2;
+	else
+		return -EINVAL;
+
+	return ctr_idx;
+}
+
+static int pmu_get_programmable_pmc_index(struct kvm_pmu *kvpmu, unsigned long eidx,
+					  unsigned long cbase, unsigned long cmask)
+{
+	int ctr_idx = -1;
+	int i, pmc_idx;
+	int min, max;
+
+	if (pmu_is_fw_event(eidx)) {
+		/* Firmware counters are mapped 1:1 starting from num_hw_ctrs for simplicity */
+		min = kvpmu->num_hw_ctrs;
+		max = min + kvpmu->num_fw_ctrs;
+	} else {
+		/* First 3 counters are reserved for fixed counters */
+		min = 3;
+		max = kvpmu->num_hw_ctrs;
+	}
+
+	for_each_set_bit(i, &cmask, BITS_PER_LONG) {
+		pmc_idx = i + cbase;
+		if ((pmc_idx >= min && pmc_idx < max) &&
+		    !test_bit(pmc_idx, kvpmu->used_pmc)) {
+			ctr_idx = pmc_idx;
+			break;
+		}
+	}
+
+	return ctr_idx;
+}
+
+static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx,
+			     unsigned long cbase, unsigned long cmask)
+{
+	int ret;
+
+	/* Fixed counters need to be have fixed mapping as they have different width */
+	ret = pmu_get_fixed_pmc_index(eidx);
+	if (ret >= 0)
+		return ret;
+
+	return pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask);
+}
+
 int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
 				unsigned long *out_val)
 {
@@ -43,7 +197,6 @@  int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num,
 
 	if (!kvpmu)
 		return KVM_INSN_EXIT_TO_USER_SPACE;
-	//TODO: Should we check if vcpu pmu is initialized or not!
 	if (wr_mask)
 		return KVM_INSN_ILLEGAL_TRAP;
 	cidx = csr_num - CSR_CYCLE;
@@ -81,14 +234,62 @@  int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx,
 int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base,
 				 unsigned long ctr_mask, unsigned long flag, uint64_t ival)
 {
-	/* TODO */
+	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+	int i, num_ctrs, pmc_index;
+	struct kvm_pmc *pmc;
+
+	num_ctrs = kvpmu->num_fw_ctrs + kvpmu->num_hw_ctrs;
+	if (ctr_base + __fls(ctr_mask) >= num_ctrs)
+		return -EINVAL;
+
+	/* Start the counters that have been configured and requested by the guest */
+	for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
+		pmc_index = i + ctr_base;
+		if (!test_bit(pmc_index, kvpmu->used_pmc))
+			continue;
+		pmc = &kvpmu->pmc[pmc_index];
+		if (flag & SBI_PMU_START_FLAG_SET_INIT_VALUE)
+			pmc->counter_val = ival;
+		if (pmc->perf_event) {
+			perf_event_period(pmc->perf_event, pmu_get_sample_period(pmc));
+			perf_event_enable(pmc->perf_event);
+		}
+	}
+
 	return 0;
 }
 
 int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base,
 				 unsigned long ctr_mask, unsigned long flag)
 {
-	/* TODO */
+	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+	int i, num_ctrs, pmc_index;
+	u64 enabled, running;
+	struct kvm_pmc *pmc;
+
+	num_ctrs = kvpmu->num_fw_ctrs + kvpmu->num_hw_ctrs;
+	if ((ctr_base + __fls(ctr_mask)) >= num_ctrs)
+		return -EINVAL;
+
+	/* Stop the counters that have been configured and requested by the guest */
+	for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
+		pmc_index = i + ctr_base;
+		if (!test_bit(pmc_index, kvpmu->used_pmc))
+			continue;
+		pmc = &kvpmu->pmc[pmc_index];
+		if (pmc->perf_event) {
+			/* Stop counting the counter */
+			perf_event_disable(pmc->perf_event);
+			if (flag & SBI_PMU_STOP_FLAG_RESET) {
+				/* Relase the counter if this is a reset request */
+				pmc->counter_val += perf_event_read_value(pmc->perf_event,
+									  &enabled, &running);
+				pmu_release_perf_event(pmc);
+				clear_bit(pmc_index, kvpmu->used_pmc);
+			}
+		}
+	}
+
 	return 0;
 }
 
@@ -96,14 +297,85 @@  int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_ba
 				     unsigned long ctr_mask, unsigned long flag,
 				     unsigned long eidx, uint64_t edata)
 {
-	/* TODO */
-	return 0;
+	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+	struct perf_event *event;
+	struct perf_event_attr attr;
+	int num_ctrs, ctr_idx;
+	u32 etype = pmu_get_perf_event_type(eidx);
+	u64 config;
+	struct kvm_pmc *pmc;
+
+	num_ctrs = kvpmu->num_fw_ctrs + kvpmu->num_hw_ctrs;
+	if ((etype == PERF_TYPE_MAX) || ((ctr_base + __fls(ctr_mask)) >= num_ctrs))
+		return -EINVAL;
+
+	if (pmu_is_fw_event(eidx))
+		return -EOPNOTSUPP;
+	/*
+	 * SKIP_MATCH flag indicates the caller is aware of the assigned counter
+	 * for this event. Just do a sanity check if it already marked used.
+	 */
+	if (flag & SBI_PMU_CFG_FLAG_SKIP_MATCH) {
+		if (!test_bit(ctr_base, kvpmu->used_pmc))
+			return -EINVAL;
+		ctr_idx = ctr_base;
+		goto match_done;
+	}
+
+	ctr_idx = pmu_get_pmc_index(kvpmu, eidx, ctr_base, ctr_mask);
+	if (ctr_idx < 0)
+		return -EOPNOTSUPP;
+
+match_done:
+	pmc = &kvpmu->pmc[ctr_idx];
+	pmu_release_perf_event(pmc);
+	pmc->idx = ctr_idx;
+
+	config = pmu_get_perf_event_config(eidx, edata);
+	memset(&attr, 0, sizeof(struct perf_event_attr));
+	attr.type = etype;
+	attr.size = sizeof(attr);
+	attr.pinned = true;
+
+	/*
+	 * It should never reach here if the platform doesn't support sscofpmf extensio
+	 * as mode filtering won't work without it.
+	 */
+	attr.exclude_host = true;
+	attr.exclude_hv = true;
+	attr.exclude_user = flag & SBI_PMU_CFG_FLAG_SET_UINH ? 1 : 0;
+	attr.exclude_kernel = flag & SBI_PMU_CFG_FLAG_SET_SINH ? 1 : 0;
+	attr.config = config;
+	attr.config1 = RISCV_KVM_PMU_CONFIG1_GUEST_EVENTS;
+	if (flag & SBI_PMU_CFG_FLAG_CLEAR_VALUE) {
+		//TODO: Do we really want to clear the value in hardware counter
+		pmc->counter_val = 0;
+	}
+	/*
+	 * Set the default sample_period for now. The guest specified value
+	 * will be updated in the start call.
+	 */
+	attr.sample_period = pmu_get_sample_period(pmc);
+
+	event = perf_event_create_kernel_counter(&attr, -1, current, NULL, pmc);
+	if (IS_ERR(event)) {
+		pr_err("kvm pmu event creation failed event %pe for eidx %lx\n", event, eidx);
+		return -EOPNOTSUPP;
+	}
+
+	set_bit(ctr_idx, kvpmu->used_pmc);
+	pmc->perf_event = event;
+	if (flag & SBI_PMU_CFG_FLAG_AUTO_START)
+		perf_event_enable(pmc->perf_event);
+
+	return ctr_idx;
 }
 
 int kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
 {
 	int i = 0, num_hw_ctrs, num_fw_ctrs, hpm_width;
 	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
 
 	if (!kvpmu)
 		return -EINVAL;
@@ -120,6 +392,7 @@  int kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
 		return -EINVAL;
 	}
 
+	bitmap_zero(kvpmu->used_pmc, RISCV_MAX_COUNTERS);
 	kvpmu->num_hw_ctrs = num_hw_ctrs;
 	kvpmu->num_fw_ctrs = num_fw_ctrs;
 	/*
@@ -132,38 +405,49 @@  int kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
 		/* TIME CSR shouldn't be read from perf interface */
 		if (i == 1)
 			continue;
-		kvpmu->pmc[i].idx = i;
-		kvpmu->pmc[i].vcpu = vcpu;
+		pmc = &kvpmu->pmc[i];
+		pmc->idx = i;
+		pmc->counter_val = 0;
+		pmc->vcpu = vcpu;
 		if (i < kvpmu->num_hw_ctrs) {
 			kvpmu->pmc[i].cinfo.type = SBI_PMU_CTR_TYPE_HW;
 			if (i < 3)
 				/* CY, IR counters */
-				kvpmu->pmc[i].cinfo.width = 63;
+				pmc->cinfo.width = 63;
 			else
-				kvpmu->pmc[i].cinfo.width = hpm_width;
+				pmc->cinfo.width = hpm_width;
 			/*
 			 * The CSR number doesn't have any relation with the logical
 			 * hardware counters. The CSR numbers are encoded sequentially
 			 * to avoid maintaining a map between the virtual counter
 			 * and CSR number.
 			 */
-			kvpmu->pmc[i].cinfo.csr = CSR_CYCLE + i;
+			pmc->cinfo.csr = CSR_CYCLE + i;
 		} else {
-			kvpmu->pmc[i].cinfo.type = SBI_PMU_CTR_TYPE_FW;
-			kvpmu->pmc[i].cinfo.width = BITS_PER_LONG - 1;
+			pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW;
+			pmc->cinfo.width = BITS_PER_LONG - 1;
 		}
 	}
 
 	return 0;
 }
 
-void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
+void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
 {
-	/* TODO */
+	struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
+	int i;
+
+	if (!kvpmu)
+		return;
+
+	for_each_set_bit(i, kvpmu->used_pmc, RISCV_MAX_COUNTERS) {
+		pmc = &kvpmu->pmc[i];
+		pmu_release_perf_event(pmc);
+	}
 }
 
-void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
+void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
 {
-	/* TODO */
+	kvm_riscv_vcpu_pmu_deinit(vcpu);
 }
-