diff mbox series

[RFC,net-next] net/sched: act_police: add support for packet-per-second policing

Message ID 20210125151819.8313-1-simon.horman@netronome.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series [RFC,net-next] net/sched: act_police: add support for packet-per-second policing | expand

Commit Message

Simon Horman Jan. 25, 2021, 3:18 p.m. UTC
From: Baowen Zheng <baowen.zheng@corigine.com>

Allow a policer action to enforce a rate-limit based on packets-per-second,
configurable using a packet-per-second rate and burst parameters. This may
be used in conjunction with existing byte-per-second rate limiting in the
same policer action.

e.g.
tc filter add dev tap1 parent ffff: u32 match \
              u32 0 0 police pkts_rate 3000 pkts_burst 1000

Testing was unable to uncover a performance impact of this change on
existing features.

Signed-off-by: Baowen Zheng <baowen.zheng@corigine.com>
Signed-off-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: Louis Peens <louis.peens@netronome.com>
---
 include/net/sch_generic.h      | 15 ++++++++++++++
 include/net/tc_act/tc_police.h |  4 ++++
 include/uapi/linux/pkt_cls.h   |  2 ++
 net/sched/act_police.c         | 37 +++++++++++++++++++++++++++++++---
 net/sched/sch_generic.c        | 32 +++++++++++++++++++++++++++++
 5 files changed, 87 insertions(+), 3 deletions(-)

Comments

Jakub Kicinski Jan. 27, 2021, 2:38 a.m. UTC | #1
On Mon, 25 Jan 2021 16:18:19 +0100 Simon Horman wrote:
> From: Baowen Zheng <baowen.zheng@corigine.com>
> 
> Allow a policer action to enforce a rate-limit based on packets-per-second,
> configurable using a packet-per-second rate and burst parameters. This may
> be used in conjunction with existing byte-per-second rate limiting in the
> same policer action.
> 
> e.g.
> tc filter add dev tap1 parent ffff: u32 match \
>               u32 0 0 police pkts_rate 3000 pkts_burst 1000
> 
> Testing was unable to uncover a performance impact of this change on
> existing features.
> 
> Signed-off-by: Baowen Zheng <baowen.zheng@corigine.com>
> Signed-off-by: Simon Horman <simon.horman@netronome.com>
> Signed-off-by: Louis Peens <louis.peens@netronome.com>

> diff --git a/net/sched/act_police.c b/net/sched/act_police.c
> index 8d8452b1cdd4..d700b2105535 100644
> --- a/net/sched/act_police.c
> +++ b/net/sched/act_police.c
> @@ -42,6 +42,8 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
>  	[TCA_POLICE_RESULT]	= { .type = NLA_U32 },
>  	[TCA_POLICE_RATE64]     = { .type = NLA_U64 },
>  	[TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 },
> +	[TCA_POLICE_PKTRATE64]  = { .type = NLA_U64 },
> +	[TCA_POLICE_PKTBURST64] = { .type = NLA_U64 },

Should we set the policy so that .min = 1?

>  };
>  
>  static int tcf_police_init(struct net *net, struct nlattr *nla,
> @@ -61,6 +63,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
>  	bool exists = false;
>  	u32 index;
>  	u64 rate64, prate64;
> +	u64 pps, ppsburst;
>  
>  	if (nla == NULL)
>  		return -EINVAL;
> @@ -183,6 +186,16 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
>  	if (tb[TCA_POLICE_AVRATE])
>  		new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
>  
> +	if (tb[TCA_POLICE_PKTRATE64] && tb[TCA_POLICE_PKTBURST64]) {

Should we reject if only one is present?

> +		pps = nla_get_u64(tb[TCA_POLICE_PKTRATE64]);
> +		ppsburst = nla_get_u64(tb[TCA_POLICE_PKTBURST64]);
> +		if (pps) {
> +			new->pps_present = true;
> +			new->tcfp_pkt_burst = PSCHED_TICKS2NS(ppsburst);
> +			psched_ppscfg_precompute(&new->ppsrate, pps);
> +		}
> +	}
> +
>  	spin_lock_bh(&police->tcf_lock);
>  	spin_lock_bh(&police->tcfp_lock);
>  	police->tcfp_t_c = ktime_get_ns();

> +void psched_ppscfg_precompute(struct psched_pktrate *r,
> +			      u64 pktrate64)
> +{
> +	memset(r, 0, sizeof(*r));
> +	r->rate_pkts_ps = pktrate64;
> +	r->mult = 1;
> +	/* The deal here is to replace a divide by a reciprocal one
> +	 * in fast path (a reciprocal divide is a multiply and a shift)
> +	 *
> +	 * Normal formula would be :
> +	 *  time_in_ns = (NSEC_PER_SEC * pkt_num) / pktrate64
> +	 *
> +	 * We compute mult/shift to use instead :
> +	 *  time_in_ns = (len * mult) >> shift;
> +	 *
> +	 * We try to get the highest possible mult value for accuracy,
> +	 * but have to make sure no overflows will ever happen.
> +	 */
> +	if (r->rate_pkts_ps > 0) {
> +		u64 factor = NSEC_PER_SEC;
> +
> +		for (;;) {
> +			r->mult = div64_u64(factor, r->rate_pkts_ps);
> +			if (r->mult & (1U << 31) || factor & (1ULL << 63))
> +				break;
> +			factor <<= 1;
> +			r->shift++;

Aren't there helpers somewhere for the reciprocal divide
pre-calculation?

> +		}
> +	}
> +}
> +EXPORT_SYMBOL(psched_ppscfg_precompute);
Simon Horman Jan. 27, 2021, 11:02 a.m. UTC | #2
Hi Jakub,

On Tue, Jan 26, 2021 at 06:38:12PM -0800, Jakub Kicinski wrote:
> On Mon, 25 Jan 2021 16:18:19 +0100 Simon Horman wrote:
> > From: Baowen Zheng <baowen.zheng@corigine.com>
> > 
> > Allow a policer action to enforce a rate-limit based on packets-per-second,
> > configurable using a packet-per-second rate and burst parameters. This may
> > be used in conjunction with existing byte-per-second rate limiting in the
> > same policer action.
> > 
> > e.g.
> > tc filter add dev tap1 parent ffff: u32 match \
> >               u32 0 0 police pkts_rate 3000 pkts_burst 1000
> > 
> > Testing was unable to uncover a performance impact of this change on
> > existing features.
> > 
> > Signed-off-by: Baowen Zheng <baowen.zheng@corigine.com>
> > Signed-off-by: Simon Horman <simon.horman@netronome.com>
> > Signed-off-by: Louis Peens <louis.peens@netronome.com>
> 
> > diff --git a/net/sched/act_police.c b/net/sched/act_police.c
> > index 8d8452b1cdd4..d700b2105535 100644
> > --- a/net/sched/act_police.c
> > +++ b/net/sched/act_police.c
> > @@ -42,6 +42,8 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
> >  	[TCA_POLICE_RESULT]	= { .type = NLA_U32 },
> >  	[TCA_POLICE_RATE64]     = { .type = NLA_U64 },
> >  	[TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 },
> > +	[TCA_POLICE_PKTRATE64]  = { .type = NLA_U64 },
> > +	[TCA_POLICE_PKTBURST64] = { .type = NLA_U64 },
> 
> Should we set the policy so that .min = 1?

Yes, I think so.
Thanks for spotting that.

> >  };
> >  
> >  static int tcf_police_init(struct net *net, struct nlattr *nla,
> > @@ -61,6 +63,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
> >  	bool exists = false;
> >  	u32 index;
> >  	u64 rate64, prate64;
> > +	u64 pps, ppsburst;
> >  
> >  	if (nla == NULL)
> >  		return -EINVAL;
> > @@ -183,6 +186,16 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
> >  	if (tb[TCA_POLICE_AVRATE])
> >  		new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
> >  
> > +	if (tb[TCA_POLICE_PKTRATE64] && tb[TCA_POLICE_PKTBURST64]) {
> 
> Should we reject if only one is present?

Again, yes I think so.
I'll confirm this with the author too.

> > +		pps = nla_get_u64(tb[TCA_POLICE_PKTRATE64]);
> > +		ppsburst = nla_get_u64(tb[TCA_POLICE_PKTBURST64]);
> > +		if (pps) {
> > +			new->pps_present = true;
> > +			new->tcfp_pkt_burst = PSCHED_TICKS2NS(ppsburst);
> > +			psched_ppscfg_precompute(&new->ppsrate, pps);
> > +		}
> > +	}
> > +
> >  	spin_lock_bh(&police->tcf_lock);
> >  	spin_lock_bh(&police->tcfp_lock);
> >  	police->tcfp_t_c = ktime_get_ns();
> 
> > +void psched_ppscfg_precompute(struct psched_pktrate *r,
> > +			      u64 pktrate64)
> > +{
> > +	memset(r, 0, sizeof(*r));
> > +	r->rate_pkts_ps = pktrate64;
> > +	r->mult = 1;
> > +	/* The deal here is to replace a divide by a reciprocal one
> > +	 * in fast path (a reciprocal divide is a multiply and a shift)
> > +	 *
> > +	 * Normal formula would be :
> > +	 *  time_in_ns = (NSEC_PER_SEC * pkt_num) / pktrate64
> > +	 *
> > +	 * We compute mult/shift to use instead :
> > +	 *  time_in_ns = (len * mult) >> shift;
> > +	 *
> > +	 * We try to get the highest possible mult value for accuracy,
> > +	 * but have to make sure no overflows will ever happen.
> > +	 */
> > +	if (r->rate_pkts_ps > 0) {
> > +		u64 factor = NSEC_PER_SEC;
> > +
> > +		for (;;) {
> > +			r->mult = div64_u64(factor, r->rate_pkts_ps);
> > +			if (r->mult & (1U << 31) || factor & (1ULL << 63))
> > +				break;
> > +			factor <<= 1;
> > +			r->shift++;
> 
> Aren't there helpers somewhere for the reciprocal divide
> pre-calculation?

Now that you mention it, yes.

Looking over reciprocal_divide() I don't think it a good fit here as it
operates on 32bit values, whereas the packet rate is 64 bit.

Packet rate could be changed to a 32 bit entity if we convince ourselves we
don't want more than 2^32 - 1 packets per second (a plausible position
IMHO) - but that leads us to a secondary issue.

The code above is very similar to an existing (long existing)
byte rate variant of this helper - psched_ratecfg_precompute().
And I do think we want to:
a) Support 64-bit byte rates. Indeed such support seems to have
   been added to support 25G use-cases
b) Calculate byte and packet rates the same way

So I feel less and less that reciprocal_divide() is a good fit.
But perhaps I am mistaken.

In the meantime I will take a look to see if a helper common function can
be made to do (64 bit) reciprocal divides for the packet and byte rate
use-cases.  I.e. the common code in psched_ppscfg_precompute() and
psched_ratecfg_precompute().

> > +		}
> > +	}
> > +}
> > +EXPORT_SYMBOL(psched_ppscfg_precompute);
> 
>
Jakub Kicinski Jan. 27, 2021, 8:59 p.m. UTC | #3
On Wed, 27 Jan 2021 12:02:23 +0100 Simon Horman wrote:
> > > +void psched_ppscfg_precompute(struct psched_pktrate *r,
> > > +			      u64 pktrate64)
> > > +{
> > > +	memset(r, 0, sizeof(*r));
> > > +	r->rate_pkts_ps = pktrate64;
> > > +	r->mult = 1;
> > > +	/* The deal here is to replace a divide by a reciprocal one
> > > +	 * in fast path (a reciprocal divide is a multiply and a shift)
> > > +	 *
> > > +	 * Normal formula would be :
> > > +	 *  time_in_ns = (NSEC_PER_SEC * pkt_num) / pktrate64
> > > +	 *
> > > +	 * We compute mult/shift to use instead :
> > > +	 *  time_in_ns = (len * mult) >> shift;
> > > +	 *
> > > +	 * We try to get the highest possible mult value for accuracy,
> > > +	 * but have to make sure no overflows will ever happen.
> > > +	 */
> > > +	if (r->rate_pkts_ps > 0) {
> > > +		u64 factor = NSEC_PER_SEC;
> > > +
> > > +		for (;;) {
> > > +			r->mult = div64_u64(factor, r->rate_pkts_ps);
> > > +			if (r->mult & (1U << 31) || factor & (1ULL << 63))
> > > +				break;
> > > +			factor <<= 1;
> > > +			r->shift++;  
> > 
> > Aren't there helpers somewhere for the reciprocal divide
> > pre-calculation?  
> 
> Now that you mention it, yes.
> 
> Looking over reciprocal_divide() I don't think it a good fit here as it
> operates on 32bit values, whereas the packet rate is 64 bit.
> 
> Packet rate could be changed to a 32 bit entity if we convince ourselves we
> don't want more than 2^32 - 1 packets per second (a plausible position
> IMHO) - but that leads us to a secondary issue.
> 
> The code above is very similar to an existing (long existing)
> byte rate variant of this helper - psched_ratecfg_precompute().
> And I do think we want to:
> a) Support 64-bit byte rates. Indeed such support seems to have
>    been added to support 25G use-cases
> b) Calculate byte and packet rates the same way
> 
> So I feel less and less that reciprocal_divide() is a good fit.
> But perhaps I am mistaken.
> 
> In the meantime I will take a look to see if a helper common function can
> be made to do (64 bit) reciprocal divides for the packet and byte rate
> use-cases.  I.e. the common code in psched_ppscfg_precompute() and
> psched_ratecfg_precompute().

No strong feelings, I'll just ask to document the reasoning in the
commit message or the comment above.
Simon Horman Jan. 28, 2021, 12:02 p.m. UTC | #4
On Wed, Jan 27, 2021 at 12:59:05PM -0800, Jakub Kicinski wrote:
> On Wed, 27 Jan 2021 12:02:23 +0100 Simon Horman wrote:
> > > > +void psched_ppscfg_precompute(struct psched_pktrate *r,
> > > > +			      u64 pktrate64)
> > > > +{
> > > > +	memset(r, 0, sizeof(*r));
> > > > +	r->rate_pkts_ps = pktrate64;
> > > > +	r->mult = 1;
> > > > +	/* The deal here is to replace a divide by a reciprocal one
> > > > +	 * in fast path (a reciprocal divide is a multiply and a shift)
> > > > +	 *
> > > > +	 * Normal formula would be :
> > > > +	 *  time_in_ns = (NSEC_PER_SEC * pkt_num) / pktrate64
> > > > +	 *
> > > > +	 * We compute mult/shift to use instead :
> > > > +	 *  time_in_ns = (len * mult) >> shift;
> > > > +	 *
> > > > +	 * We try to get the highest possible mult value for accuracy,
> > > > +	 * but have to make sure no overflows will ever happen.
> > > > +	 */
> > > > +	if (r->rate_pkts_ps > 0) {
> > > > +		u64 factor = NSEC_PER_SEC;
> > > > +
> > > > +		for (;;) {
> > > > +			r->mult = div64_u64(factor, r->rate_pkts_ps);
> > > > +			if (r->mult & (1U << 31) || factor & (1ULL << 63))
> > > > +				break;
> > > > +			factor <<= 1;
> > > > +			r->shift++;  
> > > 
> > > Aren't there helpers somewhere for the reciprocal divide
> > > pre-calculation?  
> > 
> > Now that you mention it, yes.
> > 
> > Looking over reciprocal_divide() I don't think it a good fit here as it
> > operates on 32bit values, whereas the packet rate is 64 bit.
> > 
> > Packet rate could be changed to a 32 bit entity if we convince ourselves we
> > don't want more than 2^32 - 1 packets per second (a plausible position
> > IMHO) - but that leads us to a secondary issue.
> > 
> > The code above is very similar to an existing (long existing)
> > byte rate variant of this helper - psched_ratecfg_precompute().
> > And I do think we want to:
> > a) Support 64-bit byte rates. Indeed such support seems to have
> >    been added to support 25G use-cases
> > b) Calculate byte and packet rates the same way
> > 
> > So I feel less and less that reciprocal_divide() is a good fit.
> > But perhaps I am mistaken.
> > 
> > In the meantime I will take a look to see if a helper common function can
> > be made to do (64 bit) reciprocal divides for the packet and byte rate
> > use-cases.  I.e. the common code in psched_ppscfg_precompute() and
> > psched_ratecfg_precompute().
> 
> No strong feelings, I'll just ask to document the reasoning in the
> commit message or the comment above.

Thanks. I'll include some text explaining this when reposting.
Simon Horman Jan. 28, 2021, 12:08 p.m. UTC | #5
On Wed, Jan 27, 2021 at 12:02:23PM +0100, Simon Horman wrote:
> Hi Jakub,
> 
> On Tue, Jan 26, 2021 at 06:38:12PM -0800, Jakub Kicinski wrote:
> > On Mon, 25 Jan 2021 16:18:19 +0100 Simon Horman wrote:
> > > From: Baowen Zheng <baowen.zheng@corigine.com>
> > > 
> > > Allow a policer action to enforce a rate-limit based on packets-per-second,
> > > configurable using a packet-per-second rate and burst parameters. This may
> > > be used in conjunction with existing byte-per-second rate limiting in the
> > > same policer action.
> > > 
> > > e.g.
> > > tc filter add dev tap1 parent ffff: u32 match \
> > >               u32 0 0 police pkts_rate 3000 pkts_burst 1000
> > > 
> > > Testing was unable to uncover a performance impact of this change on
> > > existing features.
> > > 
> > > Signed-off-by: Baowen Zheng <baowen.zheng@corigine.com>
> > > Signed-off-by: Simon Horman <simon.horman@netronome.com>
> > > Signed-off-by: Louis Peens <louis.peens@netronome.com>
> > 
> > > diff --git a/net/sched/act_police.c b/net/sched/act_police.c
> > > index 8d8452b1cdd4..d700b2105535 100644
> > > --- a/net/sched/act_police.c
> > > +++ b/net/sched/act_police.c
> > > @@ -42,6 +42,8 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
> > >  	[TCA_POLICE_RESULT]	= { .type = NLA_U32 },
> > >  	[TCA_POLICE_RATE64]     = { .type = NLA_U64 },
> > >  	[TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 },
> > > +	[TCA_POLICE_PKTRATE64]  = { .type = NLA_U64 },
> > > +	[TCA_POLICE_PKTBURST64] = { .type = NLA_U64 },
> > 
> > Should we set the policy so that .min = 1?
> 
> Yes, I think so.
> Thanks for spotting that.

It seems that I was mistaken.

A value of 0 is used to clear packet-per-second rate limiting
while leaving bit-per-second rate configuration in place for a
policer action.

So I think the policy should be left as is...

> > >  };
> > >  
> > >  static int tcf_police_init(struct net *net, struct nlattr *nla,
> > > @@ -61,6 +63,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
> > >  	bool exists = false;
> > >  	u32 index;
> > >  	u64 rate64, prate64;
> > > +	u64 pps, ppsburst;
> > >  
> > >  	if (nla == NULL)
> > >  		return -EINVAL;
> > > @@ -183,6 +186,16 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
> > >  	if (tb[TCA_POLICE_AVRATE])
> > >  		new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
> > >  
> > > +	if (tb[TCA_POLICE_PKTRATE64] && tb[TCA_POLICE_PKTBURST64]) {
> > 
> > Should we reject if only one is present?
> 
> Again, yes I think so.
> I'll confirm this with the author too.

... but add this restriction so the code will require either:

1. Both PKTRATE64 and PKTBURST64 are non-zero: packet-per-second limit is set;
   or
2. Both PKTRATE64 and PKTBURST64 are zero: packet-per-second limit is cleared

...
Ido Schimmel Jan. 28, 2021, 4:19 p.m. UTC | #6
On Mon, Jan 25, 2021 at 04:18:19PM +0100, Simon Horman wrote:
> From: Baowen Zheng <baowen.zheng@corigine.com>
> 
> Allow a policer action to enforce a rate-limit based on packets-per-second,
> configurable using a packet-per-second rate and burst parameters. This may
> be used in conjunction with existing byte-per-second rate limiting in the
> same policer action.

Hi Simon,

Any reason to allow metering based on both packets and bytes at the same
action versus adding a mode (packets / bytes) parameter? You can then
chain two policers if you need to rate limit based on both. Something
like:

# tc filter add dev tap1 ingress pref 1 matchall \
	action police rate 1000Mbit burst 128k conform-exceed drop/pipe \
	action police pkts_rate 3000 pkts_burst 1000

I'm asking because the policers in the Spectrum ASIC are built that way
and I also don't remember seeing such a mixed mode online.

> 
> e.g.
> tc filter add dev tap1 parent ffff: u32 match \
>               u32 0 0 police pkts_rate 3000 pkts_burst 1000
> 
> Testing was unable to uncover a performance impact of this change on
> existing features.
> 
> Signed-off-by: Baowen Zheng <baowen.zheng@corigine.com>
> Signed-off-by: Simon Horman <simon.horman@netronome.com>
> Signed-off-by: Louis Peens <louis.peens@netronome.com>
> ---
>  include/net/sch_generic.h      | 15 ++++++++++++++
>  include/net/tc_act/tc_police.h |  4 ++++
>  include/uapi/linux/pkt_cls.h   |  2 ++
>  net/sched/act_police.c         | 37 +++++++++++++++++++++++++++++++---
>  net/sched/sch_generic.c        | 32 +++++++++++++++++++++++++++++
>  5 files changed, 87 insertions(+), 3 deletions(-)

The intermediate representation in include/net/flow_offload.h needs to
carry the new configuration so that drivers will be able to veto
unsupported configuration.
Simon Horman Feb. 1, 2021, 12:31 p.m. UTC | #7
On Thu, Jan 28, 2021 at 06:19:33PM +0200, Ido Schimmel wrote:
> On Mon, Jan 25, 2021 at 04:18:19PM +0100, Simon Horman wrote:
> > From: Baowen Zheng <baowen.zheng@corigine.com>
> > 
> > Allow a policer action to enforce a rate-limit based on packets-per-second,
> > configurable using a packet-per-second rate and burst parameters. This may
> > be used in conjunction with existing byte-per-second rate limiting in the
> > same policer action.
> 
> Hi Simon,
> 
> Any reason to allow metering based on both packets and bytes at the same
> action versus adding a mode (packets / bytes) parameter? You can then
> chain two policers if you need to rate limit based on both. Something
> like:
> 
> # tc filter add dev tap1 ingress pref 1 matchall \
> 	action police rate 1000Mbit burst 128k conform-exceed drop/pipe \
> 	action police pkts_rate 3000 pkts_burst 1000
> 
> I'm asking because the policers in the Spectrum ASIC are built that way
> and I also don't remember seeing such a mixed mode online.

Hi Ido,

sorry for missing this email until you pointed it out to me in another
thread.

We did consider this question during development and our conclusion was
that it was useful as we do have use-cases which call for both to be used
and it seems nice to allow lower layers to determine the order in which the
actions are applied to satisfied the user's more general request for both -
it should be no surprise that we plan to provide a hardware offload of this
feature. It also seems to offer nice code re-use. We did also try to
examine the performance impact of this change on existing use-cases and it
appeared to be negligible/within noise of our measurements.

> > e.g.
> > tc filter add dev tap1 parent ffff: u32 match \
> >               u32 0 0 police pkts_rate 3000 pkts_burst 1000
> > 
> > Testing was unable to uncover a performance impact of this change on
> > existing features.
> > 
> > Signed-off-by: Baowen Zheng <baowen.zheng@corigine.com>
> > Signed-off-by: Simon Horman <simon.horman@netronome.com>
> > Signed-off-by: Louis Peens <louis.peens@netronome.com>
> > ---
> >  include/net/sch_generic.h      | 15 ++++++++++++++
> >  include/net/tc_act/tc_police.h |  4 ++++
> >  include/uapi/linux/pkt_cls.h   |  2 ++
> >  net/sched/act_police.c         | 37 +++++++++++++++++++++++++++++++---
> >  net/sched/sch_generic.c        | 32 +++++++++++++++++++++++++++++
> >  5 files changed, 87 insertions(+), 3 deletions(-)
> 
> The intermediate representation in include/net/flow_offload.h needs to
> carry the new configuration so that drivers will be able to veto
> unsupported configuration.
Simon Horman Feb. 1, 2021, 12:38 p.m. UTC | #8
On Mon, Feb 01, 2021 at 01:31:17PM +0100, Simon Horman wrote:
> On Thu, Jan 28, 2021 at 06:19:33PM +0200, Ido Schimmel wrote:
> > On Mon, Jan 25, 2021 at 04:18:19PM +0100, Simon Horman wrote:
> > > From: Baowen Zheng <baowen.zheng@corigine.com>
> > > 
> > > Allow a policer action to enforce a rate-limit based on packets-per-second,
> > > configurable using a packet-per-second rate and burst parameters. This may
> > > be used in conjunction with existing byte-per-second rate limiting in the
> > > same policer action.
> > 
> > Hi Simon,
> > 
> > Any reason to allow metering based on both packets and bytes at the same
> > action versus adding a mode (packets / bytes) parameter? You can then
> > chain two policers if you need to rate limit based on both. Something
> > like:
> > 
> > # tc filter add dev tap1 ingress pref 1 matchall \
> > 	action police rate 1000Mbit burst 128k conform-exceed drop/pipe \
> > 	action police pkts_rate 3000 pkts_burst 1000
> > 
> > I'm asking because the policers in the Spectrum ASIC are built that way
> > and I also don't remember seeing such a mixed mode online.
> 
> Hi Ido,
> 
> sorry for missing this email until you pointed it out to me in another
> thread.
> 
> We did consider this question during development and our conclusion was
> that it was useful as we do have use-cases which call for both to be used
> and it seems nice to allow lower layers to determine the order in which the
> actions are applied to satisfied the user's more general request for both -
> it should be no surprise that we plan to provide a hardware offload of this
> feature. It also seems to offer nice code re-use. We did also try to
> examine the performance impact of this change on existing use-cases and it
> appeared to be negligible/within noise of our measurements.
> 
> > > e.g.
> > > tc filter add dev tap1 parent ffff: u32 match \
> > >               u32 0 0 police pkts_rate 3000 pkts_burst 1000
> > > 
> > > Testing was unable to uncover a performance impact of this change on
> > > existing features.
> > > 
> > > Signed-off-by: Baowen Zheng <baowen.zheng@corigine.com>
> > > Signed-off-by: Simon Horman <simon.horman@netronome.com>
> > > Signed-off-by: Louis Peens <louis.peens@netronome.com>
> > > ---
> > >  include/net/sch_generic.h      | 15 ++++++++++++++
> > >  include/net/tc_act/tc_police.h |  4 ++++
> > >  include/uapi/linux/pkt_cls.h   |  2 ++
> > >  net/sched/act_police.c         | 37 +++++++++++++++++++++++++++++++---
> > >  net/sched/sch_generic.c        | 32 +++++++++++++++++++++++++++++
> > >  5 files changed, 87 insertions(+), 3 deletions(-)
> > 
> > The intermediate representation in include/net/flow_offload.h needs to
> > carry the new configuration so that drivers will be able to veto
> > unsupported configuration.

Thanks for pointing that out. I do have a patch for that but was planning
to post it as a follow-up to keep this series simple. I do see your point
that the flow_offload.h change is a dependency of this patch. I'll include
it when reposting.
Ido Schimmel Feb. 1, 2021, 5:41 p.m. UTC | #9
On Mon, Feb 01, 2021 at 01:31:17PM +0100, Simon Horman wrote:
> On Thu, Jan 28, 2021 at 06:19:33PM +0200, Ido Schimmel wrote:
> > On Mon, Jan 25, 2021 at 04:18:19PM +0100, Simon Horman wrote:
> > > From: Baowen Zheng <baowen.zheng@corigine.com>
> > > 
> > > Allow a policer action to enforce a rate-limit based on packets-per-second,
> > > configurable using a packet-per-second rate and burst parameters. This may
> > > be used in conjunction with existing byte-per-second rate limiting in the
> > > same policer action.
> > 
> > Hi Simon,
> > 
> > Any reason to allow metering based on both packets and bytes at the same
> > action versus adding a mode (packets / bytes) parameter? You can then
> > chain two policers if you need to rate limit based on both. Something
> > like:
> > 
> > # tc filter add dev tap1 ingress pref 1 matchall \
> > 	action police rate 1000Mbit burst 128k conform-exceed drop/pipe \
> > 	action police pkts_rate 3000 pkts_burst 1000
> > 
> > I'm asking because the policers in the Spectrum ASIC are built that way
> > and I also don't remember seeing such a mixed mode online.
> 
> Hi Ido,
> 
> sorry for missing this email until you pointed it out to me in another
> thread.

Hi,

No problem. There were (are?) some issues with netdev mails lately.

> 
> We did consider this question during development and our conclusion was
> that it was useful as we do have use-cases which call for both to be used
> and it seems nice to allow lower layers to determine the order in which the
> actions are applied to satisfied the user's more general request for both -

The lower layer need to respect whatever is implemented in the software
data path, but with this approach it is not clear which limit is imposed
first. One needs to check act_police's code for that. With the more
discrete approach (two actions), user is in complete control.

There is also an issue of visibility into how many packets were dropped
due to which limit. With this approach both drops are squashed to the
same counter. In the hardware offload case, I assume this would entail
reading the drop counters of two different policers which might not be
atomic (at least on Spectrum).

> it should be no surprise that we plan to provide a hardware offload of this
> feature. 

Sure. I assumed this was the intention. With Netronome hardware, would
such an action be translated into two actions / two policers?

> It also seems to offer nice code re-use.

Yes, the diff is nice, but I do not think it would be much worse if rate
and bandwidth limiting were made to be mutually exclusive.

> We did also try to examine the performance impact of this change on
> existing use-cases and it appeared to be negligible/within noise of
> our measurements.
> 
> > > e.g.
> > > tc filter add dev tap1 parent ffff: u32 match \
> > >               u32 0 0 police pkts_rate 3000 pkts_burst 1000
> > > 
> > > Testing was unable to uncover a performance impact of this change on
> > > existing features.
> > > 
> > > Signed-off-by: Baowen Zheng <baowen.zheng@corigine.com>
> > > Signed-off-by: Simon Horman <simon.horman@netronome.com>
> > > Signed-off-by: Louis Peens <louis.peens@netronome.com>
> > > ---
> > >  include/net/sch_generic.h      | 15 ++++++++++++++
> > >  include/net/tc_act/tc_police.h |  4 ++++
> > >  include/uapi/linux/pkt_cls.h   |  2 ++
> > >  net/sched/act_police.c         | 37 +++++++++++++++++++++++++++++++---
> > >  net/sched/sch_generic.c        | 32 +++++++++++++++++++++++++++++
> > >  5 files changed, 87 insertions(+), 3 deletions(-)
> > 
> > The intermediate representation in include/net/flow_offload.h needs to
> > carry the new configuration so that drivers will be able to veto
> > unsupported configuration.
diff mbox series

Patch

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 639e465a108f..1a93ad634fee 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -1234,6 +1234,21 @@  static inline void psched_ratecfg_getrate(struct tc_ratespec *res,
 	res->linklayer = (r->linklayer & TC_LINKLAYER_MASK);
 }
 
+struct psched_pktrate {
+	u64	rate_pkts_ps; /* packets per second */
+	u32	mult;
+	u8	shift;
+};
+
+static inline u64 psched_pkt2t_ns(const struct psched_pktrate *r,
+				  unsigned int pkt_num)
+{
+	return ((u64)pkt_num * r->mult) >> r->shift;
+}
+
+void psched_ppscfg_precompute(struct psched_pktrate *r,
+			      u64 pktrate64);
+
 /* Mini Qdisc serves for specific needs of ingress/clsact Qdisc.
  * The fast path only needs to access filter list and to update stats
  */
diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h
index 6d1e26b709b5..b106460a8d66 100644
--- a/include/net/tc_act/tc_police.h
+++ b/include/net/tc_act/tc_police.h
@@ -10,10 +10,13 @@  struct tcf_police_params {
 	s64			tcfp_burst;
 	u32			tcfp_mtu;
 	s64			tcfp_mtu_ptoks;
+	s64			tcfp_pkt_burst;
 	struct psched_ratecfg	rate;
 	bool			rate_present;
 	struct psched_ratecfg	peak;
 	bool			peak_present;
+	struct psched_pktrate	ppsrate;
+	bool			pps_present;
 	struct rcu_head rcu;
 };
 
@@ -24,6 +27,7 @@  struct tcf_police {
 	spinlock_t		tcfp_lock ____cacheline_aligned_in_smp;
 	s64			tcfp_toks;
 	s64			tcfp_ptoks;
+	s64			tcfp_pkttoks;
 	s64			tcfp_t_c;
 };
 
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index ee95f42fb0ec..bffa54e73785 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -190,6 +190,8 @@  enum {
 	TCA_POLICE_PAD,
 	TCA_POLICE_RATE64,
 	TCA_POLICE_PEAKRATE64,
+	TCA_POLICE_PKTRATE64,
+	TCA_POLICE_PKTBURST64,
 	__TCA_POLICE_MAX
 #define TCA_POLICE_RESULT TCA_POLICE_RESULT
 };
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 8d8452b1cdd4..d700b2105535 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -42,6 +42,8 @@  static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
 	[TCA_POLICE_RESULT]	= { .type = NLA_U32 },
 	[TCA_POLICE_RATE64]     = { .type = NLA_U64 },
 	[TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 },
+	[TCA_POLICE_PKTRATE64]  = { .type = NLA_U64 },
+	[TCA_POLICE_PKTBURST64] = { .type = NLA_U64 },
 };
 
 static int tcf_police_init(struct net *net, struct nlattr *nla,
@@ -61,6 +63,7 @@  static int tcf_police_init(struct net *net, struct nlattr *nla,
 	bool exists = false;
 	u32 index;
 	u64 rate64, prate64;
+	u64 pps, ppsburst;
 
 	if (nla == NULL)
 		return -EINVAL;
@@ -183,6 +186,16 @@  static int tcf_police_init(struct net *net, struct nlattr *nla,
 	if (tb[TCA_POLICE_AVRATE])
 		new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
 
+	if (tb[TCA_POLICE_PKTRATE64] && tb[TCA_POLICE_PKTBURST64]) {
+		pps = nla_get_u64(tb[TCA_POLICE_PKTRATE64]);
+		ppsburst = nla_get_u64(tb[TCA_POLICE_PKTBURST64]);
+		if (pps) {
+			new->pps_present = true;
+			new->tcfp_pkt_burst = PSCHED_TICKS2NS(ppsburst);
+			psched_ppscfg_precompute(&new->ppsrate, pps);
+		}
+	}
+
 	spin_lock_bh(&police->tcf_lock);
 	spin_lock_bh(&police->tcfp_lock);
 	police->tcfp_t_c = ktime_get_ns();
@@ -217,8 +230,8 @@  static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
 			  struct tcf_result *res)
 {
 	struct tcf_police *police = to_police(a);
+	s64 now, toks, ppstoks = 0, ptoks = 0;
 	struct tcf_police_params *p;
-	s64 now, toks, ptoks = 0;
 	int ret;
 
 	tcf_lastuse_update(&police->tcf_tm);
@@ -236,7 +249,7 @@  static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
 	}
 
 	if (qdisc_pkt_len(skb) <= p->tcfp_mtu) {
-		if (!p->rate_present) {
+		if (!p->rate_present && !p->pps_present) {
 			ret = p->tcfp_result;
 			goto end;
 		}
@@ -251,14 +264,22 @@  static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
 			ptoks -= (s64)psched_l2t_ns(&p->peak,
 						    qdisc_pkt_len(skb));
 		}
+		if (p->pps_present) {
+			ppstoks = min_t(s64, now - police->tcfp_t_c, p->tcfp_pkt_burst);
+			ppstoks += police->tcfp_pkttoks;
+			if (ppstoks > p->tcfp_pkt_burst)
+				ppstoks = p->tcfp_pkt_burst;
+			ppstoks -= (s64)psched_pkt2t_ns(&p->ppsrate, 1);
+		}
 		toks += police->tcfp_toks;
 		if (toks > p->tcfp_burst)
 			toks = p->tcfp_burst;
 		toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb));
-		if ((toks|ptoks) >= 0) {
+		if ((toks | ptoks | ppstoks) >= 0) {
 			police->tcfp_t_c = now;
 			police->tcfp_toks = toks;
 			police->tcfp_ptoks = ptoks;
+			police->tcfp_pkttoks = ppstoks;
 			spin_unlock_bh(&police->tcfp_lock);
 			ret = p->tcfp_result;
 			goto inc_drops;
@@ -331,6 +352,16 @@  static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
 				      TCA_POLICE_PAD))
 			goto nla_put_failure;
 	}
+	if (p->pps_present) {
+		if (nla_put_u64_64bit(skb, TCA_POLICE_PKTRATE64,
+				      police->params->ppsrate.rate_pkts_ps,
+				      TCA_POLICE_PAD))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_POLICE_PKTBURST64,
+				      PSCHED_NS2TICKS(p->tcfp_pkt_burst),
+				      TCA_POLICE_PAD))
+			goto nla_put_failure;
+	}
 	if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
 		goto nla_put_failure;
 	if (p->tcfp_result &&
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 49eae93d1489..209e34ee4bd3 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1361,6 +1361,38 @@  void psched_ratecfg_precompute(struct psched_ratecfg *r,
 }
 EXPORT_SYMBOL(psched_ratecfg_precompute);
 
+void psched_ppscfg_precompute(struct psched_pktrate *r,
+			      u64 pktrate64)
+{
+	memset(r, 0, sizeof(*r));
+	r->rate_pkts_ps = pktrate64;
+	r->mult = 1;
+	/* The deal here is to replace a divide by a reciprocal one
+	 * in fast path (a reciprocal divide is a multiply and a shift)
+	 *
+	 * Normal formula would be :
+	 *  time_in_ns = (NSEC_PER_SEC * pkt_num) / pktrate64
+	 *
+	 * We compute mult/shift to use instead :
+	 *  time_in_ns = (len * mult) >> shift;
+	 *
+	 * We try to get the highest possible mult value for accuracy,
+	 * but have to make sure no overflows will ever happen.
+	 */
+	if (r->rate_pkts_ps > 0) {
+		u64 factor = NSEC_PER_SEC;
+
+		for (;;) {
+			r->mult = div64_u64(factor, r->rate_pkts_ps);
+			if (r->mult & (1U << 31) || factor & (1ULL << 63))
+				break;
+			factor <<= 1;
+			r->shift++;
+		}
+	}
+}
+EXPORT_SYMBOL(psched_ppscfg_precompute);
+
 static void mini_qdisc_rcu_func(struct rcu_head *head)
 {
 }