[v3,1/5] kvm: x86/pmu: Introduce masked events to the pmu event filter

Message ID 20220709011726.1006267-2-aaronlewis@google.com (mailing list archive)
State New, archived
Series kvm: x86/pmu: Introduce and test masked events

Commit Message

Aaron Lewis July 9, 2022, 1:17 a.m. UTC
When building an event list for the pmu event filter, fitting all the
events in the limited space can be a challenge.  It becomes
particularly challenging when trying to include various unit mask
combinations for a particular event the guest is allowed or not allowed
to program.  Instead of increasing the size of the list to allow for
these, add a new encoding in the pmu event filter's events field.  These
encoded events can then be tested against the event the guest is
attempting to program to determine whether the guest should have access
to it.

The encoded values are: mask, match, and invert.  When filtering events,
the mask is applied to the guest's unit mask to see if it matches the
match value (i.e. unit_mask & mask == match).  The invert bit can then
be used to exclude events from that match.  For example, if it is easier
to say which events shouldn't be filtered, one encoded event can be set
up to match all possible unit masks for a particular eventsel, and a
second encoded event, with its invert bit set, can be set up to match
only the unit masks that shouldn't be filtered.

This feature is enabled by setting the flags field to
KVM_PMU_EVENT_FLAG_MASKED_EVENTS.

Events can be encoded by using KVM_PMU_EVENT_ENCODE_MASKED_EVENT().
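For example, with an ALLOW list, the following two illustrative entries
let the guest program every unit mask of eventsel 0xC0 except unit mask
0x01:

  /* Match any unit mask of eventsel 0xC0 (mask = match = 0)... */
  filter->events[0] = KVM_PMU_EVENT_ENCODE_MASKED_EVENT(0xC0, 0x00, 0x00, 0);
  /* ...then exclude unit mask 0x01 by setting the invert bit. */
  filter->events[1] = KVM_PMU_EVENT_ENCODE_MASKED_EVENT(0xC0, 0xff, 0x01, 1);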

It is an error to have a bit set outside valid encoded bits, and calls
to KVM_SET_PMU_EVENT_FILTER will return -EINVAL in such cases,
including bits that are set in the high nybble[1] for AMD if called on
Intel.

[1] bits 35:32 in the event and bits 11:8 in the eventsel.

Signed-off-by: Aaron Lewis <aaronlewis@google.com>
---
 Documentation/virt/kvm/api.rst         |  52 ++++++++--
 arch/x86/include/asm/kvm-x86-pmu-ops.h |   1 +
 arch/x86/include/uapi/asm/kvm.h        |   8 ++
 arch/x86/kvm/pmu.c                     | 135 ++++++++++++++++++++++---
 arch/x86/kvm/pmu.h                     |   1 +
 arch/x86/kvm/svm/pmu.c                 |  12 +++
 arch/x86/kvm/vmx/pmu_intel.c           |  12 +++
 7 files changed, 203 insertions(+), 18 deletions(-)

Comments

Jim Mattson Aug. 3, 2022, 12:19 a.m. UTC | #1
On Fri, Jul 8, 2022 at 6:17 PM Aaron Lewis <aaronlewis@google.com> wrote:
>
> When building an event list for the pmu event filter, fitting all the
> events in the limited space can be a challenge.  It becomes
> particularly challenging when trying to include various unit mask
> combinations for a particular event the guest is allowed or not allowed
> to program.  Instead of increasing the size of the list to allow for
> these, add a new encoding in the pmu event filter's events field.  These
> encoded events can then be tested against the event the guest is
> attempting to program to determine whether the guest should have access
> to it.
>
> The encoded values are: mask, match, and invert.  When filtering events,
> the mask is applied to the guest's unit mask to see if it matches the
> match value (i.e. unit_mask & mask == match).  The invert bit can then
> be used to exclude events from that match.  For example, if it is easier
> to say which events shouldn't be filtered, one encoded event can be set
> up to match all possible unit masks for a particular eventsel, and a
> second encoded event, with its invert bit set, can be set up to match
> only the unit masks that shouldn't be filtered.
>
> This feature is enabled by setting the flags field to
> KVM_PMU_EVENT_FLAG_MASKED_EVENTS.
>
> Events can be encoded by using KVM_PMU_EVENT_ENCODE_MASKED_EVENT().
>
> It is an error to have a bit set outside valid encoded bits, and calls
> to KVM_SET_PMU_EVENT_FILTER will return -EINVAL in such cases,
> including bits that are set in the high nybble[1] for AMD if called on
> Intel.
>
> [1] bits 35:32 in the event and bits 11:8 in the eventsel.

I think there is some confusion in the documentation and the code
regarding what an 'eventsel' is. Yes, Intel started it, with
"performance event select registers" that contain an "event select"
field, but the 64-bit object you refer to as an 'event' here in the
commit message is typically referred to as an 'eventsel' in the code
below. Maybe it's too late to avoid confusion, but I'd suggest
referring to the 64-bit object as a "PMU event filter entry," or
something like that.

> Signed-off-by: Aaron Lewis <aaronlewis@google.com>

> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 21614807a2cb..2964f3f15fb5 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -522,6 +522,14 @@ struct kvm_pmu_event_filter {
>  #define KVM_PMU_EVENT_ALLOW 0
>  #define KVM_PMU_EVENT_DENY 1
>
> +#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS (1u << 0)

This can be BIT(0).

> +#define KVM_PMU_EVENT_ENCODE_MASKED_EVENT(select, mask, match, invert) \
> +               (((select) & 0xfful) | (((select) & 0xf00ul) << 24) | \
> +               (((mask) & 0xfful) << 24) | \
> +               (((match) & 0xfful) << 8) | \
> +               (((invert) & 0x1ul) << 23))

Please convert the masks and shifts to GENMASK_ULL().

>  /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
>  #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
>  #define   KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index 3f868fed9114..99c02bbb8f32 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -197,14 +197,106 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
>         return true;
>  }
>
> -static int cmp_u64(const void *pa, const void *pb)
> +static inline u64 get_event(u64 eventsel)
> +{
> +       return eventsel & AMD64_EVENTSEL_EVENT;
> +}
> +
> +static inline u8 get_unit_mask(u64 eventsel)
> +{
> +       return (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
> +}
> +
> +static inline u8 get_counter_mask(u64 eventsel)
>  {
> -       u64 a = *(u64 *)pa;
> -       u64 b = *(u64 *)pb;
> +       return (eventsel & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
> +}

In a raw PMU event, this field is the counter mask, but in an event
filter object, it's the "unit-mask mask."
> +
> +static inline bool get_invert_comparison(u64 eventsel)
> +{
> +       return !!(eventsel & ARCH_PERFMON_EVENTSEL_INV);
> +}

You can drop the !! and parentheses. Scalar to boolean conversion does
the right thing.

> +static inline int cmp_safe64(u64 a, u64 b)
> +{
>         return (a > b) - (a < b);
>  }
>
> +static int cmp_eventsel_event(const void *pa, const void *pb)
> +{
> +       return cmp_safe64(*(u64 *)pa & AMD64_EVENTSEL_EVENT,
> +                         *(u64 *)pb & AMD64_EVENTSEL_EVENT);
> +}
> +
> +static int cmp_u64(const void *pa, const void *pb)
> +{
> +       return cmp_safe64(*(u64 *)pa,
> +                         *(u64 *)pb);

Nit: join these lines.

> +}
> +
> +static inline bool is_match(u64 masked_event, u64 eventsel)
> +{
> +       u8 mask = get_counter_mask(masked_event);
> +       u8 match = get_unit_mask(masked_event);
> +       u8 unit_mask = get_unit_mask(eventsel);
> +
> +       return (unit_mask & mask) == match;
> +}
> +
> +static inline bool is_inverted(u64 masked_event)
> +{
> +       return get_invert_comparison(masked_event);
> +}
> +
> +static bool is_filtered(struct kvm_pmu_event_filter *filter, u64 eventsel,
> +                       bool invert)
> +{
> +       u64 key = get_event(eventsel);
> +       u64 *event, *evt;
> +
> +       event = bsearch(&key, filter->events, filter->nevents, sizeof(u64),
> +                       cmp_eventsel_event);
> +
> +       if (event) {
> +               /* Walk the masked events backward looking for a match. */
> +               for (evt = event; evt >= filter->events &&
> +                    get_event(*evt) == get_event(eventsel); evt--)

Replace get_event(eventsel) with key.

> +                       if (is_inverted(*evt) == invert && is_match(*evt, eventsel))
> +                               return true;
> +
> +               /* Walk the masked events forward looking for a match. */
> +               for (evt = event + 1;
> +                    evt < (filter->events + filter->nevents) &&
> +                    get_event(*evt) == get_event(eventsel); evt++)

Replace get_event(eventsel) with key.

> +                       if (is_inverted(*evt) == invert && is_match(*evt, eventsel))
> +                               return true;
> +       }
> +
> +       return false;
> +}
>
> +static bool allowed_by_masked_events(struct kvm_pmu_event_filter *filter,
> +                                    u64 eventsel)
> +{
> +       if (is_filtered(filter, eventsel, /*invert=*/false))
> +               if (!is_filtered(filter, eventsel, /*invert=*/true))

Perhaps you could eliminate the ugly parameter comments if you
maintained the "normal" and inverted entries in separate lists. It
might also speed things up for the common case, assuming that inverted
entries are uncommon.

> +                       return filter->action == KVM_PMU_EVENT_ALLOW;
> +
> +       return filter->action == KVM_PMU_EVENT_DENY;
> +}
> +
> +static bool allowed_by_default_events(struct kvm_pmu_event_filter *filter,
> +                                   u64 eventsel)
> +{
> +       u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
> +
> +       if (bsearch(&key, filter->events, filter->nevents,
> +                   sizeof(u64), cmp_u64))
> +               return filter->action == KVM_PMU_EVENT_ALLOW;
> +
> +       return filter->action == KVM_PMU_EVENT_DENY;
> +}
> +
>  void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
>  {
>         u64 config;
> @@ -226,14 +318,11 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
>
>         filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
>         if (filter) {
> -               __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
> -
> -               if (bsearch(&key, filter->events, filter->nevents,
> -                           sizeof(__u64), cmp_u64))
> -                       allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
> -               else
> -                       allow_event = filter->action == KVM_PMU_EVENT_DENY;
> +               allow_event = (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) ?
> +                       allowed_by_masked_events(filter, eventsel) :
> +                       allowed_by_default_events(filter, eventsel);

If you converted all of the legacy filters into masked filters by
simply setting the mask field to '0xff' when copying from userspace,
you wouldn't need the complexity of two different matching algorithms.
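
E.g., roughly (untested, helper name made up), at filter-setup time:

	/*
	 * Rewrite each legacy entry as a masked event that requires an exact
	 * unit mask match (mask = 0xff, match = the entry's unit mask).
	 */
	static void convert_to_masked_filter(struct kvm_pmu_event_filter *filter)
	{
		u64 e;
		int i;

		for (i = 0; i < filter->nevents; i++) {
			e = filter->events[i];
			filter->events[i] = KVM_PMU_EVENT_ENCODE_MASKED_EVENT(
				(e & 0xff) | ((e >> 24) & 0xf00), /* event select */
				0xff,                             /* mask */
				(e >> 8) & 0xff,                  /* match */
				0);                               /* invert */
		}
	}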

>         }
> +
>         if (!allow_event)
>                 return;
>
> @@ -572,8 +661,22 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
>  }
>  EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
>
> +static int has_invalid_event(struct kvm_pmu_event_filter *filter)
> +{
> +       u64 event_mask;
> +       int i;
> +
> +       event_mask = static_call(kvm_x86_pmu_get_event_mask)(filter->flags);
> +       for (i = 0; i < filter->nevents; i++)
> +               if (filter->events[i] & ~event_mask)
> +                       return true;
> +
> +       return false;
> +}
> +
>  int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
>  {
> +       int (*cmp)(const void *a, const void *b) = cmp_u64;
>         struct kvm_pmu_event_filter tmp, *filter;
>         size_t size;
>         int r;
> @@ -585,7 +688,7 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
>             tmp.action != KVM_PMU_EVENT_DENY)
>                 return -EINVAL;
>
> -       if (tmp.flags != 0)
> +       if (tmp.flags & ~KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
>                 return -EINVAL;
>
>         if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
> @@ -603,10 +706,18 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
>         /* Ensure nevents can't be changed between the user copies. */
>         *filter = tmp;
>
> +       r = -EINVAL;
> +       /* To maintain backwards compatibility don't validate flags == 0. */
> +       if (filter->flags != 0 && has_invalid_event(filter))
> +               goto cleanup;
> +
> +       if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
> +               cmp = cmp_eventsel_event;
> +
>         /*
>          * Sort the in-kernel list so that we can search it with bsearch.
>          */
> -       sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
> +       sort(&filter->events, filter->nevents, sizeof(u64), cmp, NULL);

I don't believe two different comparison functions are necessary. In
the legacy case, when setting up the filter, you should be able to
simply discard any filter entries that have extraneous bits set,
because they will never match.
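
Something like this (untested, name made up) when the filter is set:

	/*
	 * Drop legacy entries with bits outside the eventsel+umask used to
	 * build the lookup key; they can never match.
	 */
	static void remove_impossible_events(struct kvm_pmu_event_filter *filter)
	{
		int i, j = 0;

		for (i = 0; i < filter->nevents; i++) {
			if (!(filter->events[i] & ~AMD64_RAW_EVENT_MASK_NB))
				filter->events[j++] = filter->events[i];
		}

		filter->nevents = j;
	}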

>         mutex_lock(&kvm->lock);
>         filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
Jim Mattson Aug. 3, 2022, 12:28 p.m. UTC | #2
On Tue, Aug 2, 2022 at 5:19 PM Jim Mattson <jmattson@google.com> wrote:
>
> On Fri, Jul 8, 2022 at 6:17 PM Aaron Lewis <aaronlewis@google.com> wrote:
> >
> > +#define KVM_PMU_EVENT_ENCODE_MASKED_EVENT(select, mask, match, invert) \
> > +               (((select) & 0xfful) | (((select) & 0xf00ul) << 24) | \
> > +               (((mask) & 0xfful) << 24) | \
> > +               (((match) & 0xfful) << 8) | \
> > +               (((invert) & 0x1ul) << 23))
>
> Please convert the masks and shifts to GENMASK_ULL().

Sorry. Ignore this suggestion. It makes no sense.
Aaron Lewis Aug. 4, 2022, 1:36 a.m. UTC | #3
On Tue, Aug 2, 2022 at 5:19 PM Jim Mattson <jmattson@google.com> wrote:
>
> On Fri, Jul 8, 2022 at 6:17 PM Aaron Lewis <aaronlewis@google.com> wrote:
> >
> > This feature is enabled by setting the flags field to
> > KVM_PMU_EVENT_FLAG_MASKED_EVENTS.
> >
> > Events can be encoded by using KVM_PMU_EVENT_ENCODE_MASKED_EVENT().
> >
> > It is an error to have a bit set outside valid encoded bits, and calls
> > to KVM_SET_PMU_EVENT_FILTER will return -EINVAL in such cases,
> > including bits that are set in the high nybble[1] for AMD if called on
> > Intel.
> >
> > [1] bits 35:32 in the event and bits 11:8 in the eventsel.
>
> I think there is some confusion in the documentation and the code
> regarding what an 'eventsel' is. Yes, Intel started it, with
> "performance event select registers" that contain an "event select"
> field, but the 64-bit object you refer to as an 'event' here in the
> commit message is typically referred to as an 'eventsel' in the code

Yeah, it does look like 'eventsel' is more commonly used in the kernel
to refer to the "performance event select register".  I do see a few
locations where 'eventsel' refers to an eventsel's event.  Maybe I can
rename those in a separate series to avoid future confusion, though
another reason I named it that way is because the SDM and APM both
refer to the eventsel's event as an "event select" which naturally
makes sense to call it eventsel.  I figured we'd want to be consistent
with the docs.

If we prefer to call the 64-bit object 'eventsel', then what do we
call the eventsel's event?  Maybe 'eventsel's event' is good enough,
however, it's unfortunate "event select" is overloaded like it is
because I don't feel the need to say "eventsel's unit mask".

I'm open to other suggestions as well.  There's got to be something
better than that.

> below. Maybe it's too late to avoid confusion, but I'd suggest
> referring to the 64-bit object as a "PMU event filter entry," or
> something like that.
>

What about "filtered event"?

> > +}
> >
> > +static bool allowed_by_masked_events(struct kvm_pmu_event_filter *filter,
> > +                                    u64 eventsel)
> > +{
> > +       if (is_filtered(filter, eventsel, /*invert=*/false))
> > +               if (!is_filtered(filter, eventsel, /*invert=*/true))
>
> Perhaps you could eliminate the ugly parameter comments if you
> maintained the "normal" and inverted entries in separate lists. It
> might also speed things up for the common case, assuming that inverted
> entries are uncommon.

Is it really that ugly?  I thought it made it more clear, so you don't
have to jump to the function to see what the bool does.

I can see an argument for walking a shorter list for inverted entries
in the common case.

To do this I'll likely make an internal copy of the struct like
'struct kvm_x86_msr_filter' to avoid the flexible array at the end of
the pmu event filter, and to not mess with the current ABI.  I'll just
have two lists at the end: one for regular entries and one for
inverted entries.  If there's a better approach I'm all ears.
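
Roughly what I'm picturing (all names are placeholders):

	/* Internal-only layout, along the lines of kvm_x86_msr_filter. */
	struct kvm_pmu_filter {
		u32 action;
		u32 nr_events;		/* number of non-inverted entries */
		u32 nr_inverted_events;	/* number of inverted entries */
		u64 *events;		/* points into data[] */
		u64 *inverted_events;	/* points into data[] */
		u64 data[];
	};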

>
> > +                       return filter->action == KVM_PMU_EVENT_ALLOW;
> > +
> > +       return filter->action == KVM_PMU_EVENT_DENY;
> > +}
> > +
> > +static bool allowed_by_default_events(struct kvm_pmu_event_filter *filter,
> > +                                   u64 eventsel)
> > +{
> > +       u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
> > +
> > +       if (bsearch(&key, filter->events, filter->nevents,
> > +                   sizeof(u64), cmp_u64))
> > +               return filter->action == KVM_PMU_EVENT_ALLOW;
> > +
> > +       return filter->action == KVM_PMU_EVENT_DENY;
> > +}
> > +
> >  void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
> >  {
> >         u64 config;
> > @@ -226,14 +318,11 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
> >
> >         filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
> >         if (filter) {
> > -               __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
> > -
> > -               if (bsearch(&key, filter->events, filter->nevents,
> > -                           sizeof(__u64), cmp_u64))
> > -                       allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
> > -               else
> > -                       allow_event = filter->action == KVM_PMU_EVENT_DENY;
> > +               allow_event = (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) ?
> > +                       allowed_by_masked_events(filter, eventsel) :
> > +                       allowed_by_default_events(filter, eventsel);
>
> If you converted all of the legacy filters into masked filters by
> simply setting the mask field to '0xff' when copying from userspace,
> you wouldn't need the complexity of two different matching algorithms.

Agreed that it will simplify the code, but it will make the legacy
case slower because instead of being able to directly search for a
filtered event, we will have to walk the list of matching eventsel's
events looking for a match.


> > @@ -603,10 +706,18 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
> >         /* Ensure nevents can't be changed between the user copies. */
> >         *filter = tmp;
> >
> > +       r = -EINVAL;
> > +       /* To maintain backwards compatibility don't validate flags == 0. */
> > +       if (filter->flags != 0 && has_invalid_event(filter))
> > +               goto cleanup;
> > +
> > +       if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
> > +               cmp = cmp_eventsel_event;
> > +
> >         /*
> >          * Sort the in-kernel list so that we can search it with bsearch.
> >          */
> > -       sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
> > +       sort(&filter->events, filter->nevents, sizeof(u64), cmp, NULL);
>
> I don't believe two different comparison functions are necessary. In
> the legacy case, when setting up the filter, you should be able to
> simply discard any filter entries that have extraneous bits set,
> because they will never match.

For masked events we need the list ordered by eventsel's event, to
ensure they are contiguous.  For the legacy case we just need the list
sorted in a way that allows us to quickly find a matching eventsel's
event + unit mask.  This lends itself well to a generic u64 sort, but
by doing this the eventsel's events will not be contiguous, which
doesn't lend itself to searching for a masked event.  To get this to
work for both cases I think we'd have to rearrange the sort to put the
bits for the eventsel's event above the unit mask when doing the
compare, which would slow the sort and compare down with all the bit
twiddling going on.
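
To be concrete, the kind of key I'm thinking of (untested):

	/*
	 * Build a compare key with the eventsel's event above the unit mask
	 * so a single sort order works for both the legacy and masked lookups.
	 */
	static u64 filter_cmp_key(u64 eventsel)
	{
		return ((eventsel & AMD64_EVENTSEL_EVENT) << 8) |
		       ((eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8);
	}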

Or if the legacy case adopted how masked events work and ate the extra
cost for a walk rather than a direct search we could join them.

>
> >         mutex_lock(&kvm->lock);
> >         filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
Sean Christopherson Aug. 4, 2022, 4:26 p.m. UTC | #4
On Sat, Jul 09, 2022, Aaron Lewis wrote:
> diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
> index fdfd8e06fee6..016713b583bf 100644
> --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
> @@ -24,6 +24,7 @@ KVM_X86_PMU_OP(set_msr)
>  KVM_X86_PMU_OP(refresh)
>  KVM_X86_PMU_OP(init)
>  KVM_X86_PMU_OP(reset)
> +KVM_X86_PMU_OP(get_event_mask)
>  KVM_X86_PMU_OP_OPTIONAL(deliver_pmi)
>  KVM_X86_PMU_OP_OPTIONAL(cleanup)
>  
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 21614807a2cb..2964f3f15fb5 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -522,6 +522,14 @@ struct kvm_pmu_event_filter {
>  #define KVM_PMU_EVENT_ALLOW 0
>  #define KVM_PMU_EVENT_DENY 1
>  
> +#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS (1u << 0)

Is this a "flag" or a "type"?  The usage in code isn't really clear, e.g. there's
both 

	if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS)

and

	if (flag == KVM_PMU_EVENT_FLAG_MASKED_EVENTS)

> +#define KVM_PMU_EVENT_ENCODE_MASKED_EVENT(select, mask, match, invert) \
> +		(((select) & 0xfful) | (((select) & 0xf00ul) << 24) | \
> +		(((mask) & 0xfful) << 24) | \
> +		(((match) & 0xfful) << 8) | \
> +		(((invert) & 0x1ul) << 23))

Please add a comment showing the actual layout, this is extremely dense to parse.
Alternatively, and probably my preference, would be to define this as a union.
Or maybe both?  E.g. so that userspace can do:

	filter[i].raw = KVM_PMU_EVENT_ENCODE_MASKED_EVENT(...);

	struct kvm_pmu_event_filter_entry {
		union {
			__u64 raw;
			struct {
				__u64 select_lo:8;
				__u64 match:8;
				__u64 rsvd1:7;
				__u64 invert:1;
				__u64 mask:8;
				__u64 select_hi:4;
				__u64 rsvd2:28;
			};
		}
	}

And that begs the question of whether or not KVM should check those "rsvd" fields.
IIUC, this layout directly correlates to hardware, and hardware defines many of the
"rsvd1" bits.  Are those just a don't care?

>  /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
>  #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
>  #define   KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index 3f868fed9114..99c02bbb8f32 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -197,14 +197,106 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
>  	return true;
>  }
>  
> -static int cmp_u64(const void *pa, const void *pb)
> +static inline u64 get_event(u64 eventsel)
> +{
> +	return eventsel & AMD64_EVENTSEL_EVENT;
> +}
> +
> +static inline u8 get_unit_mask(u64 eventsel)
> +{
> +	return (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
> +}
> +
> +static inline u8 get_counter_mask(u64 eventsel)
>  {
> -	u64 a = *(u64 *)pa;
> -	u64 b = *(u64 *)pb;
> +	return (eventsel & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
> +}
> +
> +static inline bool get_invert_comparison(u64 eventsel)
> +{
> +	return !!(eventsel & ARCH_PERFMON_EVENTSEL_INV);
> +}
>  
> +static inline int cmp_safe64(u64 a, u64 b)
> +{
>  	return (a > b) - (a < b);
>  }
>  
> +static int cmp_eventsel_event(const void *pa, const void *pb)
> +{
> +	return cmp_safe64(*(u64 *)pa & AMD64_EVENTSEL_EVENT,
> +			  *(u64 *)pb & AMD64_EVENTSEL_EVENT);

Why does common x86 code reference AMD64 masks?  If "AMD64" here just means the
x86-64 / AMD64 architecture, i.e. is common to AMD and Intel, a comment to call
that out would be helpful.

> +}
> +
> +static int cmp_u64(const void *pa, const void *pb)
> +{
> +	return cmp_safe64(*(u64 *)pa,
> +			  *(u64 *)pb);
> +}
> +
> +static inline bool is_match(u64 masked_event, u64 eventsel)

Maybe "is_filter_entry_match"?

> +{
> +	u8 mask = get_counter_mask(masked_event);
> +	u8 match = get_unit_mask(masked_event);
> +	u8 unit_mask = get_unit_mask(eventsel);
> +
> +	return (unit_mask & mask) == match;
> +}
> +
> +static inline bool is_inverted(u64 masked_event)
> +{
> +	return get_invert_comparison(masked_event);
> +}

This is a pointless wrapper.  I suspect you added it to keep line lengths short,
but there are better ways to accomplish that.  More below.

> +
> +static bool is_filtered(struct kvm_pmu_event_filter *filter, u64 eventsel,
> +			bool invert)
> +{
> +	u64 key = get_event(eventsel);
> +	u64 *event, *evt;
> +
> +	event = bsearch(&key, filter->events, filter->nevents, sizeof(u64),
> +			cmp_eventsel_event);
> +
> +	if (event) {
> +		/* Walk the masked events backward looking for a match. */

This isn't a very helpful comment.  It's obvious enough from the code that this
walks backwards looking for a match, while the next one walks forward.  But it's
not immediately obvious _why_ that's the behavior.  I eventually realized the code
is looking for _any_ match, but a comment above this function documenting that
would be very helpful.

And doesn't this approach yield somewhat arbitrary behavior?  If there are multiple
filters for 

> +		for (evt = event; evt >= filter->events &&
> +		     get_event(*evt) == get_event(eventsel); evt--)

The outer for-loop needs curly braces.  See "3) Placing Braces and Spaces" in
Documentation/process/coding-style.rst.

The "get_event() == get_event()" 
 
I have a strong dislike for arithmetic on pointers that aren't a single byte.  I
don't think it's entirely avoidable, but it could be contained (see below).

> +			if (is_inverted(*evt) == invert && is_match(*evt, eventsel))

The is_inverted() helper can be avoided by handling the inversion check in
is_match().

			if (is_match(*event, eventsel, invert)

	if (entry) {
		pivot = <compute index of found entry>

		/*
		 * comment about how the key works and wanting to find any
		 * matching filter entry.
		 */
		for (i = pivot; i > 0; i--) {
			entry = &filter->events[i];
			if (get_event(*entry) != get_event(eventsel)
				break;
	
			if (is_match(*entry, eventsel, invert))
				return true;
		}

		for (i = pivot + 1; i < filter->nevents; i++) {
			entry = &filter->events[i];
			if (get_event(*entry) != get_event(eventsel)
				break;

			if (is_match(*entry, eventsel, invert))
				return true;
		}
	}

> +				return true;
> +
> +		/* Walk the masked events forward looking for a match. */
> +		for (evt = event + 1;
> +		     evt < (filter->events + filter->nevents) &&
> +		     get_event(*evt) == get_event(eventsel); evt++)
> +			if (is_inverted(*evt) == invert && is_match(*evt, eventsel))
> +				return true;
> +	}
> +
> +	return false;
> +}
> +
> +static bool allowed_by_masked_events(struct kvm_pmu_event_filter *filter,
> +				     u64 eventsel)
> +{
> +	if (is_filtered(filter, eventsel, /*invert=*/false))

Outer if-statement needs curly braces.  But even better would be to do:

	if (is_filter(...) &&
	    !is_filter(...))
		return filter->action == KVM_PMU_EVENT_ALLOW;

That said, I have zero idea what the intended logic is, i.e. this needs a verbose
comment.   Maybe if I was actually familiar with PMU event magic it would be obvious,
but I won't be the last person that reads this code with only a passing understanding
of PMU events.

Specifically, having an inverted flag _and_ an inverted bit in the event mask is
confusing.

> +		if (!is_filtered(filter, eventsel, /*invert=*/true))
> +			return filter->action == KVM_PMU_EVENT_ALLOW;
> +
> +	return filter->action == KVM_PMU_EVENT_DENY;
> +}
> +
> +static bool allowed_by_default_events(struct kvm_pmu_event_filter *filter,
> +				    u64 eventsel)
> +{
> +	u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;

Again, a comment would be helpful.  What is a "default" event?  Why does that use
a "raw" mask as they key?  But the "default" part can be avoided entirely, see below.

> +
> +	if (bsearch(&key, filter->events, filter->nevents,
> +		    sizeof(u64), cmp_u64))

Let this poke out, it's only 82 chars.  Though I would say add a helper to do the
search.  That will also make it easier to use a unionized bitfield, e.g.

static __always_inline u64 *find_filter_entry(struct kvm_pmu_event_filter *filter,
					      u64 key, cmp_func_t cmp_fn)
{
	struct kvm_pmu_event_filter_entry *entry;

	entry = bsearch(&key, filter->events, filter->nevents,
			sizeof(filter->events[0]), cmp_fn);
	if (!entry)
		return NULL;

	return &entry->raw;
}

> +		return filter->action == KVM_PMU_EVENT_ALLOW;
> +
> +	return filter->action == KVM_PMU_EVENT_DENY;
> +}
> +
>  void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
>  {
>  	u64 config;
> @@ -226,14 +318,11 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
>  
>  	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
>  	if (filter) {
> -		__u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
> -
> -		if (bsearch(&key, filter->events, filter->nevents,
> -			    sizeof(__u64), cmp_u64))
> -			allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
> -		else
> -			allow_event = filter->action == KVM_PMU_EVENT_DENY;
> +		allow_event = (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) ?
> +			allowed_by_masked_events(filter, eventsel) :
> +			allowed_by_default_events(filter, eventsel);

Using a ternary operator isn't buying much.  E.g. IMO this is more readable:

		if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
			allow_event = allowed_by_masked_events(filter, eventsel);
		else
			allow_event = allowed_by_default_events(filter, eventsel);

But again, better to avoid this entirely.

>  	}
> +
>  	if (!allow_event)

Checking "allow_event" outside of the "if (filter)" is unnecessary and confusing.

>  		return;

Rather than have separate helpers for "masked" versus "default", just have a single
"is_event_allowed"

static bool is_event_allowed(struct kvm_pmu_event_filter *filter, u64 eventsel)
{
	u64 key;

	if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
		if (is_filtered(filter, eventsel, false) &&
		    !is_filtered(filter, eventsel, true))
			return filter->action == KVM_PMU_EVENT_ALLOW;
		goto not_filtered;
	}

	key = eventsel & AMD64_RAW_EVENT_MASK_NB;
	if (find_filter_entry(filter, key, cmp_u64))
		return filter->action == KVM_PMU_EVENT_ALLOW;

not_filtered:
	return filter->action == KVM_PMU_EVENT_DENY;
}


And then this is succint and readable:

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (filter && !is_event_allowed(filter, eventsel))
		return;
Sean Christopherson Aug. 4, 2022, 4:29 p.m. UTC | #5
On Thu, Aug 04, 2022, Sean Christopherson wrote:
> On Sat, Jul 09, 2022, Aaron Lewis wrote:
> 	if (entry) {
> 		pivot = <compute index of found entry>
> 
> 		/*
> 		 * comment about how the key works and wanting to find any
> 		 * matching filter entry.
> 		 */
> 		for (i = pivot; i > 0; i--) {
> 			entry = &filter->events[i];
> 			if (get_event(*entry) != get_event(eventsel)

Just saw Jim's comment about using "key".  That would indeed make it easier to
understand what this is doing.

Note, the missing parentheses throughout aren't intentional :-)
Jim Mattson Aug. 4, 2022, 4:41 p.m. UTC | #6
On Wed, Aug 3, 2022 at 6:36 PM Aaron Lewis <aaronlewis@google.com> wrote:
>
> On Tue, Aug 2, 2022 at 5:19 PM Jim Mattson <jmattson@google.com> wrote:
> >
> > On Fri, Jul 8, 2022 at 6:17 PM Aaron Lewis <aaronlewis@google.com> wrote:
> > below. Maybe it's too late to avoid confusion, but I'd suggest
> > referring to the 64-bit object as a "PMU event filter entry," or
> > something like that.
> >
>
> What about "filtered event"?

Filter event?

> > >
> > > +static bool allowed_by_masked_events(struct kvm_pmu_event_filter *filter,
> > > +                                    u64 eventsel)
> > > +{
> > > +       if (is_filtered(filter, eventsel, /*invert=*/false))
> > > +               if (!is_filtered(filter, eventsel, /*invert=*/true))
> >
> > Perhaps you could eliminate the ugly parameter comments if you
> > maintained the "normal" and inverted entries in separate lists. It
> > might also speed things up for the common case, assuming that inverted
> > entries are uncommon.
>
> Is it really that ugly?  I thought it made it more clear, so you don't
> have to jump to the function to see what the bool does.
>
> I can see an argument for walking a shorter list for inverted entries
> in the common case.
>
> To do this I'll likely make an internal copy of the struct like
> 'struct kvm_x86_msr_filter' to avoid the flexible array at the end of
> the pmu event filter, and to not mess with the current ABI.  I'll just
> have two lists at the end: one for regular entries and one for
> inverted entries.  If there's a better approach I'm all ears.

Right. I'm not suggesting that you change the ABI. If you move the
'invert' bit to a higher bit position than any event select bit, then
by including the invert bit in your sort key, the sorted list will
naturally be partitioned into positive and negative entries.
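
E.g. (untested; the exact bit position doesn't matter as long as it's
above the event select):

	/*
	 * Fold the invert bit into the sort key above the event select so
	 * the sorted list splits into non-inverted entries followed by
	 * inverted ones.
	 */
	static u64 masked_event_sort_key(u64 event)
	{
		u64 key = event & AMD64_EVENTSEL_EVENT;

		if (event & ARCH_PERFMON_EVENTSEL_INV)
			key |= BIT_ULL(36);

		return key;
	}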

> >
> > > +                       return filter->action == KVM_PMU_EVENT_ALLOW;
> > > +
> > > +       return filter->action == KVM_PMU_EVENT_DENY;
> > > +}
> > > +
> > > +static bool allowed_by_default_events(struct kvm_pmu_event_filter *filter,
> > > +                                   u64 eventsel)
> > > +{
> > > +       u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
> > > +
> > > +       if (bsearch(&key, filter->events, filter->nevents,
> > > +                   sizeof(u64), cmp_u64))
> > > +               return filter->action == KVM_PMU_EVENT_ALLOW;
> > > +
> > > +       return filter->action == KVM_PMU_EVENT_DENY;
> > > +}
> > > +
> > >  void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
> > >  {
> > >         u64 config;
> > > @@ -226,14 +318,11 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
> > >
> > >         filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
> > >         if (filter) {
> > > -               __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
> > > -
> > > -               if (bsearch(&key, filter->events, filter->nevents,
> > > -                           sizeof(__u64), cmp_u64))
> > > -                       allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
> > > -               else
> > > -                       allow_event = filter->action == KVM_PMU_EVENT_DENY;
> > > +               allow_event = (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) ?
> > > +                       allowed_by_masked_events(filter, eventsel) :
> > > +                       allowed_by_default_events(filter, eventsel);
> >
> > If you converted all of the legacy filters into masked filters by
> > simply setting the mask field to '0xff' when copying from userspace,
> > you wouldn't need the complexity of two different matching algorithms.
>
> Agreed that it will simplify the code, but it will make the legacy
> case slower because instead of being able to directly search for a
> filtered event, we will have to walk the list of matching eventsel's
> events looking for a match.

I think the simplicity of having one common abstraction outweighs the
performance penalty for legacy filter lists with a large number of
unit masks for a particular event select. However, I could be
convinced otherwise with a practical example.
Aaron Lewis Aug. 11, 2022, 12:08 a.m. UTC | #7
On Thu, Aug 4, 2022 at 4:26 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Sat, Jul 09, 2022, Aaron Lewis wrote:
> > diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
> > index fdfd8e06fee6..016713b583bf 100644
> > --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h
> > +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
> > @@ -24,6 +24,7 @@ KVM_X86_PMU_OP(set_msr)
> >  KVM_X86_PMU_OP(refresh)
> >  KVM_X86_PMU_OP(init)
> >  KVM_X86_PMU_OP(reset)
> > +KVM_X86_PMU_OP(get_event_mask)
> >  KVM_X86_PMU_OP_OPTIONAL(deliver_pmi)
> >  KVM_X86_PMU_OP_OPTIONAL(cleanup)
> >
> > diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> > index 21614807a2cb..2964f3f15fb5 100644
> > --- a/arch/x86/include/uapi/asm/kvm.h
> > +++ b/arch/x86/include/uapi/asm/kvm.h
> > @@ -522,6 +522,14 @@ struct kvm_pmu_event_filter {
> >  #define KVM_PMU_EVENT_ALLOW 0
> >  #define KVM_PMU_EVENT_DENY 1
> >
> > +#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS (1u << 0)
>
> Is this a "flag" or a "type"?  The usage in code isn't really clear, e.g. there's
> both
>
>         if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
>
> and
>
>         if (flag == KVM_PMU_EVENT_FLAG_MASKED_EVENTS)

That should be "flag & KVM_PMU_EVENT_FLAG_MASKED_EVENTS".  I'll fix.

>
> > +#define KVM_PMU_EVENT_ENCODE_MASKED_EVENT(select, mask, match, invert) \
> > +             (((select) & 0xfful) | (((select) & 0xf00ul) << 24) | \
> > +             (((mask) & 0xfful) << 24) | \
> > +             (((match) & 0xfful) << 8) | \
> > +             (((invert) & 0x1ul) << 23))
>
> Please add a comment showing the actual layout, this is extremely dense to parse.
> Alternatively, and probably my preference, would be to define this as a union.
> Or maybe both?  E.g. so that userspace can do:
>
>         filter[i].raw = KVM_PMU_EVENT_ENCODE_MASKED_EVENT(...);
>
>         struct kvm_pmu_event_filter_entry {
>                 union {
>                         __u64 raw;
>                         struct {
>                                 __u64 select_lo:8;
>                                 __u64 match:8;
>                                 __u64 rsvd1:7;
>                                 __u64 invert:1;
>                                 __u64 mask:8;
>                                 __u64 select_hi:4;
>                                 __u64 rsvd2:28;
>                         };
>                 }
>         }

Agreed.  I'll add it.  Also, I like the idea of having both the union
and the helper.  They both seem useful.

>
> And that begs the question of whether or not KVM should check those "rsvd" fields.
> IIUC, this layout directly correlates to hardware, and hardware defines many of the
> "rsvd1" bits.  Are those just a don't care?

KVM really should care about the "rsvd" bits for filter events.  I
added a check in kvm_vm_ioctl_set_pmu_event_filter() for it and
documented it in the api docs.  If a "rsvd" bit is set, attempting to
set the filter will return -EINVAL.  This ensures that if we are on
Intel the 'select_hi' bits are clear.  This is important because it
allows us to use AMD's event select mask (AMD64_EVENTSEL_EVENT) in the
compare function when sorting or searching masked events on either
platform instead of having to special case the compare function.  I
also like validating the filter events so we have more predictability
from userspace's data when using it in the kernel.

I just noticed an issue: before doing the bsearch() the key needs to
apply the platform specific mask, i.e.
static_call(kvm_x86_pmu_get_event_mask)(), to ensure IN_TX (bit 32),
IN_TXCP (bit 33), and any other 'select_hi' bits Intel uses in the
future are clear.  I'll fix that in the follow-up.  I think we should
do that for both legacy and masked events, but I appreciate any
changes to the legacy mode will change the ABI, even if it is subtle,
so I can do that as an RFC.
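
I.e. something along these lines in is_filtered() (untested):

	/*
	 * Clear any eventsel bits the platform doesn't define (e.g. IN_TX and
	 * IN_TXCP on Intel) before building the search key.
	 */
	u64 key = get_event(eventsel) &
		  static_call(kvm_x86_pmu_get_event_mask)(filter->flags);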

> >
> > +static inline int cmp_safe64(u64 a, u64 b)
> > +{
> >       return (a > b) - (a < b);
> >  }
> >
> > +static int cmp_eventsel_event(const void *pa, const void *pb)
> > +{
> > +     return cmp_safe64(*(u64 *)pa & AMD64_EVENTSEL_EVENT,
> > +                       *(u64 *)pb & AMD64_EVENTSEL_EVENT);
>
> Why does common x86 code reference AMD64 masks?  If "AMD64" here just means the
> x86-64 / AMD64 architecture, i.e. is common to AMD and Intel, a comment to call
> that out would be helpful.

Explained above. I can add a comment.

>
> > +}
> > +
> > +static int cmp_u64(const void *pa, const void *pb)
> > +{
> > +     return cmp_safe64(*(u64 *)pa,
> > +                       *(u64 *)pb);
> > +}
> > +
> > +
> > +static bool is_filtered(struct kvm_pmu_event_filter *filter, u64 eventsel,
> > +                     bool invert)
> > +{
> > +     u64 key = get_event(eventsel);
> > +     u64 *event, *evt;
> > +
> > +     event = bsearch(&key, filter->events, filter->nevents, sizeof(u64),
> > +                     cmp_eventsel_event);
> > +
> > +     if (event) {
> > +             /* Walk the masked events backward looking for a match. */
>
> This isn't a very helpful comment.  It's obvious enough from the code that this
> walks backwards looking for a match, while the next one walks forward.  But it's
> not immediately obvious _why_ that's the behavior.  I eventually realized the code
> is looking for _any_ match, but a comment above this function documenting that
> would be very helpful.
>
> And doesn't this approach yield somewhat arbitrary behavior?  If there are multiple
> filters for

I'm not sure I follow.  This should be deterministic.  An event is
either filtered or it's not, and while there are multiple cases that
would cause an event to not be filtered there is only one that will
cause it to be filtered.  That case is if a match exists in the filter
list while no corresponding inverted match exists.  Any other case
will not be filtered.  Those are: 1) if there is both a match and an
inverted match in the filter list. 2) there are no matching cases in
the filter list, inverted or non-inverted.  This second case is a
little more interesting because it can happen if no matches exist in
the filter list or if there is only an inverted match.  However,
either way we don't filter it, so it doesn't matter which one it is.

>
> > +             for (evt = event; evt >= filter->events &&
> > +                  get_event(*evt) == get_event(eventsel); evt--)
>
> > +static bool allowed_by_masked_events(struct kvm_pmu_event_filter *filter,
> > +                                  u64 eventsel)
> > +{
> > +     if (is_filtered(filter, eventsel, /*invert=*/false))
>
> Outer if-statement needs curly braces.  But even better would be to do:
>
>         if (is_filter(...) &&
>             !is_filter(...))
>                 return filter->action == KVM_PMU_EVENT_ALLOW;
>
> That said, I have zero idea what the intended logic is, i.e. this needs a verbose
> comment.   Maybe if I was actually familiar with PMU event magic it would be obvious,
> but I won't be the last person that reads this code with only a passing undertsanding
> of PMU events.
>
> Specifically, having an inverted flag _and_ an inverted bit in the event mask is
> confusing.

Maybe if I call it 'exclude' that would be better.  That does seem
like a more appropriate name.  I do think having this will be useful
at times, and in those moments it will allow for a more concise filter
list overall.  I'd really prefer to keep it.

Another option could be to remove KVM_PMU_EVENT_DENY.  I'm not sure
how useful it is.  Maybe there's a use case I'm not thinking of, but I
would think that the first step in building a deny list would be to
include all the undefined event selects + unit masks, which in legacy
mode would more than fill up the filter list.  That said, removing it
would be ABI breaking, so I thought I'd throw it out here.  Of course
there are other tricks to avoid breaking the ABI, but that would leave
the code a bit messier.  Thoughts?

>
> > +             if (!is_filtered(filter, eventsel, /*invert=*/true))
> > +                     return filter->action == KVM_PMU_EVENT_ALLOW;
> > +
> > +     return filter->action == KVM_PMU_EVENT_DENY;
> > +}
> > +

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 11e00a46c610..9316899880e8 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5017,7 +5017,13 @@  using this ioctl.
 :Architectures: x86
 :Type: vm ioctl
 :Parameters: struct kvm_pmu_event_filter (in)
-:Returns: 0 on success, -1 on error
+:Returns: 0 on success,
+    -EFAULT args[0] cannot be accessed.
+    -EINVAL args[0] contains invalid data in the filter or events field.
+                    Note: event validation is only done for modes where
+                    the flags field is non-zero.
+    -E2BIG nevents is too large.
+    -ENOMEM not enough memory to allocate the filter.
 
 ::
 
@@ -5030,14 +5036,48 @@  using this ioctl.
 	__u64 events[0];
   };
 
-This ioctl restricts the set of PMU events that the guest can program.
-The argument holds a list of events which will be allowed or denied.
-The eventsel+umask of each event the guest attempts to program is compared
-against the events field to determine whether the guest should have access.
+This ioctl restricts the set of PMU events the guest can program.  The
+argument holds a list of events which will be allowed or denied.
+
 The events field only controls general purpose counters; fixed purpose
 counters are controlled by the fixed_counter_bitmap.
 
-No flags are defined yet, the field must be zero.
+Valid values for 'flags'::
+
+``0``
+
+This is the default behavior for the pmu event filter, and used when the
+flags field is clear.  In this mode the eventsel+umask for the event the
+guest is attempting to program is compared against each event in the events
+field to determine whether the guest should have access to it.
+
+``KVM_PMU_EVENT_FLAG_MASKED_EVENTS``
+
+In this mode each event in the events field will be encoded with mask, match,
+and invert values in addition to an eventsel.  These encoded events will be
+matched against the event the guest is attempting to program to determine
+whether the guest should have access to it.  When matching a guest's event
+to the encoded events these steps are followed:
+ 1. Match the guest eventsel to the encoded eventsels.
+ 2. If a match is found, match the guest's unit mask to the mask and match
+    values of the encoded events that do not have the invert bit set
+    (ie: unit_mask & mask == match && !invert).
+ 3. If a match is found, match the guest's unit mask to the mask and match
+    values of the encoded events that have the invert bit set
+    (ie: unit_mask & mask == match && invert).
+ 4. If an inverted match is found, do not filter the event.
+ 5. If a match is found, but an inverted match is not, filter the event.
+ If the event is filtered and it's an allow list, allow the guest to program
+ the event.
+ If the event is filtered and it's a deny list, do not allow the guest to
+ program the event.
+
+To encode an event in the pmu_event_filter use
+KVM_PMU_EVENT_ENCODE_MASKED_EVENT().
+
+If a bit is set in an encoded event that is not a part of the bits used for
+eventsel, mask, match or invert a call to KVM_SET_PMU_EVENT_FILTER will
+return -EINVAL.
 
 Valid values for 'action'::
 
diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
index fdfd8e06fee6..016713b583bf 100644
--- a/arch/x86/include/asm/kvm-x86-pmu-ops.h
+++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
@@ -24,6 +24,7 @@  KVM_X86_PMU_OP(set_msr)
 KVM_X86_PMU_OP(refresh)
 KVM_X86_PMU_OP(init)
 KVM_X86_PMU_OP(reset)
+KVM_X86_PMU_OP(get_event_mask)
 KVM_X86_PMU_OP_OPTIONAL(deliver_pmi)
 KVM_X86_PMU_OP_OPTIONAL(cleanup)
 
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 21614807a2cb..2964f3f15fb5 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -522,6 +522,14 @@  struct kvm_pmu_event_filter {
 #define KVM_PMU_EVENT_ALLOW 0
 #define KVM_PMU_EVENT_DENY 1
 
+#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS (1u << 0)
+
+#define KVM_PMU_EVENT_ENCODE_MASKED_EVENT(select, mask, match, invert) \
+		(((select) & 0xfful) | (((select) & 0xf00ul) << 24) | \
+		(((mask) & 0xfful) << 24) | \
+		(((match) & 0xfful) << 8) | \
+		(((invert) & 0x1ul) << 23))
+
 /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
 #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
 #define   KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 3f868fed9114..99c02bbb8f32 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -197,14 +197,106 @@  static bool pmc_resume_counter(struct kvm_pmc *pmc)
 	return true;
 }
 
-static int cmp_u64(const void *pa, const void *pb)
+static inline u64 get_event(u64 eventsel)
+{
+	return eventsel & AMD64_EVENTSEL_EVENT;
+}
+
+static inline u8 get_unit_mask(u64 eventsel)
+{
+	return (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
+}
+
+static inline u8 get_counter_mask(u64 eventsel)
 {
-	u64 a = *(u64 *)pa;
-	u64 b = *(u64 *)pb;
+	return (eventsel & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
+}
+
+static inline bool get_invert_comparison(u64 eventsel)
+{
+	return !!(eventsel & ARCH_PERFMON_EVENTSEL_INV);
+}
 
+static inline int cmp_safe64(u64 a, u64 b)
+{
 	return (a > b) - (a < b);
 }
 
+static int cmp_eventsel_event(const void *pa, const void *pb)
+{
+	return cmp_safe64(*(u64 *)pa & AMD64_EVENTSEL_EVENT,
+			  *(u64 *)pb & AMD64_EVENTSEL_EVENT);
+}
+
+static int cmp_u64(const void *pa, const void *pb)
+{
+	return cmp_safe64(*(u64 *)pa,
+			  *(u64 *)pb);
+}
+
+static inline bool is_match(u64 masked_event, u64 eventsel)
+{
+	u8 mask = get_counter_mask(masked_event);
+	u8 match = get_unit_mask(masked_event);
+	u8 unit_mask = get_unit_mask(eventsel);
+
+	return (unit_mask & mask) == match;
+}
+
+static inline bool is_inverted(u64 masked_event)
+{
+	return get_invert_comparison(masked_event);
+}
+
+static bool is_filtered(struct kvm_pmu_event_filter *filter, u64 eventsel,
+			bool invert)
+{
+	u64 key = get_event(eventsel);
+	u64 *event, *evt;
+
+	event = bsearch(&key, filter->events, filter->nevents, sizeof(u64),
+			cmp_eventsel_event);
+
+	if (event) {
+		/* Walk the masked events backward looking for a match. */
+		for (evt = event; evt >= filter->events &&
+		     get_event(*evt) == get_event(eventsel); evt--)
+			if (is_inverted(*evt) == invert && is_match(*evt, eventsel))
+				return true;
+
+		/* Walk the masked events forward looking for a match. */
+		for (evt = event + 1;
+		     evt < (filter->events + filter->nevents) &&
+		     get_event(*evt) == get_event(eventsel); evt++)
+			if (is_inverted(*evt) == invert && is_match(*evt, eventsel))
+				return true;
+	}
+
+	return false;
+}
+
+static bool allowed_by_masked_events(struct kvm_pmu_event_filter *filter,
+				     u64 eventsel)
+{
+	if (is_filtered(filter, eventsel, /*invert=*/false))
+		if (!is_filtered(filter, eventsel, /*invert=*/true))
+			return filter->action == KVM_PMU_EVENT_ALLOW;
+
+	return filter->action == KVM_PMU_EVENT_DENY;
+}
+
+static bool allowed_by_default_events(struct kvm_pmu_event_filter *filter,
+				    u64 eventsel)
+{
+	u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
+
+	if (bsearch(&key, filter->events, filter->nevents,
+		    sizeof(u64), cmp_u64))
+		return filter->action == KVM_PMU_EVENT_ALLOW;
+
+	return filter->action == KVM_PMU_EVENT_DENY;
+}
+
 void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 {
 	u64 config;
@@ -226,14 +318,11 @@  void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 
 	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
 	if (filter) {
-		__u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
-
-		if (bsearch(&key, filter->events, filter->nevents,
-			    sizeof(__u64), cmp_u64))
-			allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
-		else
-			allow_event = filter->action == KVM_PMU_EVENT_DENY;
+		allow_event = (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) ?
+			allowed_by_masked_events(filter, eventsel) :
+			allowed_by_default_events(filter, eventsel);
 	}
+
 	if (!allow_event)
 		return;
 
@@ -572,8 +661,22 @@  void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
 }
 EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
 
+static int has_invalid_event(struct kvm_pmu_event_filter *filter)
+{
+	u64 event_mask;
+	int i;
+
+	event_mask = static_call(kvm_x86_pmu_get_event_mask)(filter->flags);
+	for (i = 0; i < filter->nevents; i++)
+		if (filter->events[i] & ~event_mask)
+			return true;
+
+	return false;
+}
+
 int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
 {
+	int (*cmp)(const void *a, const void *b) = cmp_u64;
 	struct kvm_pmu_event_filter tmp, *filter;
 	size_t size;
 	int r;
@@ -585,7 +688,7 @@  int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
 	    tmp.action != KVM_PMU_EVENT_DENY)
 		return -EINVAL;
 
-	if (tmp.flags != 0)
+	if (tmp.flags & ~KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
 		return -EINVAL;
 
 	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
@@ -603,10 +706,18 @@  int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
 	/* Ensure nevents can't be changed between the user copies. */
 	*filter = tmp;
 
+	r = -EINVAL;
+	/* To maintain backwards compatibility don't validate flags == 0. */
+	if (filter->flags != 0 && has_invalid_event(filter))
+		goto cleanup;
+
+	if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
+		cmp = cmp_eventsel_event;
+
 	/*
 	 * Sort the in-kernel list so that we can search it with bsearch.
 	 */
-	sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
+	sort(&filter->events, filter->nevents, sizeof(u64), cmp, NULL);
 
 	mutex_lock(&kvm->lock);
 	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index e745f443b6a8..f13fcc692d04 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -37,6 +37,7 @@  struct kvm_pmu_ops {
 	void (*reset)(struct kvm_vcpu *vcpu);
 	void (*deliver_pmi)(struct kvm_vcpu *vcpu);
 	void (*cleanup)(struct kvm_vcpu *vcpu);
+	u64 (*get_event_mask)(u32 flag);
 };
 
 void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 136039fc6d01..41b7bd51fd11 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -342,6 +342,17 @@  static void amd_pmu_reset(struct kvm_vcpu *vcpu)
 	}
 }
 
+static u64 amd_pmu_get_event_mask(u32 flag)
+{
+	if (flag == KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
+		return AMD64_EVENTSEL_EVENT |
+		       ARCH_PERFMON_EVENTSEL_UMASK |
+		       ARCH_PERFMON_EVENTSEL_INV |
+		       ARCH_PERFMON_EVENTSEL_CMASK;
+	return AMD64_EVENTSEL_EVENT |
+	       ARCH_PERFMON_EVENTSEL_UMASK;
+}
+
 struct kvm_pmu_ops amd_pmu_ops __initdata = {
 	.pmc_perf_hw_id = amd_pmc_perf_hw_id,
 	.pmc_is_enabled = amd_pmc_is_enabled,
@@ -355,4 +366,5 @@  struct kvm_pmu_ops amd_pmu_ops __initdata = {
 	.refresh = amd_pmu_refresh,
 	.init = amd_pmu_init,
 	.reset = amd_pmu_reset,
+	.get_event_mask = amd_pmu_get_event_mask,
 };
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 37e9eb32e3d9..27c44105760d 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -719,6 +719,17 @@  static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
 		intel_pmu_release_guest_lbr_event(vcpu);
 }
 
+static u64 intel_pmu_get_event_mask(u32 flag)
+{
+	if (flag == KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
+		return ARCH_PERFMON_EVENTSEL_EVENT |
+		       ARCH_PERFMON_EVENTSEL_UMASK |
+		       ARCH_PERFMON_EVENTSEL_INV |
+		       ARCH_PERFMON_EVENTSEL_CMASK;
+	return ARCH_PERFMON_EVENTSEL_EVENT |
+	       ARCH_PERFMON_EVENTSEL_UMASK;
+}
+
 struct kvm_pmu_ops intel_pmu_ops __initdata = {
 	.pmc_perf_hw_id = intel_pmc_perf_hw_id,
 	.pmc_is_enabled = intel_pmc_is_enabled,
@@ -734,4 +745,5 @@  struct kvm_pmu_ops intel_pmu_ops __initdata = {
 	.reset = intel_pmu_reset,
 	.deliver_pmi = intel_pmu_deliver_pmi,
 	.cleanup = intel_pmu_cleanup,
+	.get_event_mask = intel_pmu_get_event_mask,
 };