Message ID | 20220829114739.GA2436@debian (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | net-next: frags: add adaptive per-peer timeout under load | expand |
Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Guessing tree name failed - patch did not apply |
On Mon, Aug 29, 2022 at 4:49 AM Richard Gobert <richardbgobert@gmail.com> wrote: > > Calculate a dynamic fragment reassembly timeout, taking into > consideration the current fqdir load and the load introduced by > the peer. Reintroduce low_thresh, which now acts as a knob for > adjusting per-peer memory limits. > > Signed-off-by: Richard Gobert <richardbgobert@gmail.com> > --- > Documentation/networking/ip-sysctl.rst | 3 +++ > include/net/inet_frag.h | 1 + > net/ipv4/inet_fragment.c | 30 +++++++++++++++++++++++++- > net/ipv4/ip_fragment.c | 2 +- > 4 files changed, 34 insertions(+), 2 deletions(-) > > diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst > index 56cd4ea059b2..fb25aa6e22a2 100644 > --- a/Documentation/networking/ip-sysctl.rst > +++ b/Documentation/networking/ip-sysctl.rst > @@ -247,6 +247,9 @@ ipfrag_low_thresh - LONG INTEGER > begins to remove incomplete fragment queues to free up resources. > The kernel still accepts new fragments for defragmentation. > > + (Since linux-6.1) > + Maximum memory used to reassemble IP fragments sent by a single peer. > + > ipfrag_time - INTEGER > Time in seconds to keep an IP fragment in memory. > > diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h > index 077a0ec78a58..595a6db57a0e 100644 > --- a/include/net/inet_frag.h > +++ b/include/net/inet_frag.h > @@ -99,6 +99,7 @@ struct inet_frag_queue { > u16 max_size; > struct fqdir *fqdir; > struct inet_peer *peer; > + u64 timeout; Why u64 ? 
This is not what the timer interface uses (look at mod_timer(), it uses "unsigned long") > struct rcu_head rcu; > }; > > diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c > index 8b8d77d548d4..34c5ebba4951 100644 > --- a/net/ipv4/inet_fragment.c > +++ b/net/ipv4/inet_fragment.c > @@ -314,6 +314,30 @@ void inet_frag_free(struct inet_frag_queue *q) > call_rcu(&q->rcu, inet_frag_destroy_rcu); > } > > +static int inet_frag_update_timeout(struct inet_frag_queue *q) > +{ > + u64 peer_timeout, inet_timeout; > + long peer_mem, inet_mem; > + long high_thresh = READ_ONCE(q->fqdir->high_thresh); > + long low_thresh = READ_ONCE(q->fqdir->low_thresh); > + u64 base_timeout = READ_ONCE(q->fqdir->timeout); > + > + peer_mem = low_thresh - peer_mem_limit(q); > + inet_mem = high_thresh - frag_mem_limit(q->fqdir); > + > + if (peer_mem <= 0 || inet_mem <= 0) > + return -ENOMEM; > + > + /* Timeout changes linearly with respect to the amount of free memory. > + * Choose the more permissive of the two timeouts, to avoid limiting > + * the system while there is still enough memory. > + */ > + peer_timeout = div64_long(base_timeout * peer_mem, low_thresh); > + inet_timeout = div64_long(base_timeout * inet_mem, high_thresh); > + q->timeout = max_t(u64, peer_timeout, inet_timeout); If/when under load, timeout is close to zero, we would fire many timers (increased system load) and make it impossible for datagrams to complete. In contrast, a reasonable timer and probabilistic drops of new datagrams when the queue is full let some datagrams complete. 
Make sure to test your change under a real DDoS attack, not only with non-malicious netperf traffic. > + return 0; > +} > + > void inet_frag_destroy(struct inet_frag_queue *q) > { > struct fqdir *fqdir; > @@ -346,6 +370,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, > > q->fqdir = fqdir; > f->constructor(q, arg); > + if (inet_frag_update_timeout(q)) { > + inet_frag_free(q); > + return NULL; > + } > add_frag_mem_limit(q, f->qsize); > > timer_setup(&q->timer, f->frag_expire, 0); > @@ -367,7 +395,7 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, > *prev = ERR_PTR(-ENOMEM); > return NULL; > } > - mod_timer(&q->timer, jiffies + fqdir->timeout); > + mod_timer(&q->timer, jiffies + q->timeout); > > *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, > &q->node, f->rhash_params); > diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c > index e35061f6aadb..88a99242d721 100644 > --- a/net/ipv4/ip_fragment.c > +++ b/net/ipv4/ip_fragment.c > @@ -236,7 +236,7 @@ static int ip_frag_reinit(struct ipq *qp) > { > unsigned int sum_truesize = 0; > > - if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) { > + if (!mod_timer(&qp->q.timer, jiffies + qp->q.timeout)) { > refcount_inc(&qp->q.refcnt); > return -ETIMEDOUT; > } > -- > 2.36.1 >
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 56cd4ea059b2..fb25aa6e22a2 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -247,6 +247,9 @@ ipfrag_low_thresh - LONG INTEGER begins to remove incomplete fragment queues to free up resources. The kernel still accepts new fragments for defragmentation. + (Since linux-6.1) + Maximum memory used to reassemble IP fragments sent by a single peer. + ipfrag_time - INTEGER Time in seconds to keep an IP fragment in memory. diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 077a0ec78a58..595a6db57a0e 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -99,6 +99,7 @@ struct inet_frag_queue { u16 max_size; struct fqdir *fqdir; struct inet_peer *peer; + u64 timeout; struct rcu_head rcu; }; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 8b8d77d548d4..34c5ebba4951 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -314,6 +314,30 @@ void inet_frag_free(struct inet_frag_queue *q) call_rcu(&q->rcu, inet_frag_destroy_rcu); } +static int inet_frag_update_timeout(struct inet_frag_queue *q) +{ + u64 peer_timeout, inet_timeout; + long peer_mem, inet_mem; + long high_thresh = READ_ONCE(q->fqdir->high_thresh); + long low_thresh = READ_ONCE(q->fqdir->low_thresh); + u64 base_timeout = READ_ONCE(q->fqdir->timeout); + + peer_mem = low_thresh - peer_mem_limit(q); + inet_mem = high_thresh - frag_mem_limit(q->fqdir); + + if (peer_mem <= 0 || inet_mem <= 0) + return -ENOMEM; + + /* Timeout changes linearly with respect to the amount of free memory. + * Choose the more permissive of the two timeouts, to avoid limiting + * the system while there is still enough memory. 
+ */ + peer_timeout = div64_long(base_timeout * peer_mem, low_thresh); + inet_timeout = div64_long(base_timeout * inet_mem, high_thresh); + q->timeout = max_t(u64, peer_timeout, inet_timeout); + return 0; +} + void inet_frag_destroy(struct inet_frag_queue *q) { struct fqdir *fqdir; @@ -346,6 +370,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, q->fqdir = fqdir; f->constructor(q, arg); + if (inet_frag_update_timeout(q)) { + inet_frag_free(q); + return NULL; + } add_frag_mem_limit(q, f->qsize); timer_setup(&q->timer, f->frag_expire, 0); @@ -367,7 +395,7 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, *prev = ERR_PTR(-ENOMEM); return NULL; } - mod_timer(&q->timer, jiffies + fqdir->timeout); + mod_timer(&q->timer, jiffies + q->timeout); *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, &q->node, f->rhash_params); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e35061f6aadb..88a99242d721 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -236,7 +236,7 @@ static int ip_frag_reinit(struct ipq *qp) { unsigned int sum_truesize = 0; - if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) { + if (!mod_timer(&qp->q.timer, jiffies + qp->q.timeout)) { refcount_inc(&qp->q.refcnt); return -ETIMEDOUT; }
Calculate a dynamic fragment reassembly timeout, taking into consideration the current fqdir load and the load introduced by the peer. Reintroduce low_thresh, which now acts as a knob for adjusting per-peer memory limits. Signed-off-by: Richard Gobert <richardbgobert@gmail.com> --- Documentation/networking/ip-sysctl.rst | 3 +++ include/net/inet_frag.h | 1 + net/ipv4/inet_fragment.c | 30 +++++++++++++++++++++++++- net/ipv4/ip_fragment.c | 2 +- 4 files changed, 34 insertions(+), 2 deletions(-)