
[net-next] tcp: update window_clamp together with scaling_ratio

Message ID 20240402215405.432863-1-hli@netflix.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 946 this patch: 946
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 1 maintainers not CCed: dsahern@kernel.org
netdev/build_clang success Errors and warnings before: 957 this patch: 957
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 957 this patch: 957
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 22 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-04-03--09-00 (tests: 950)

Commit Message

Hechao Li April 2, 2024, 9:54 p.m. UTC
After commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale"),
we noticed an application-level timeout due to reduced throughput. This
can be reproduced by the following minimal client and server program.

server:

#include <fcntl.h>
#include <netinet/in.h>
#include <stdio.h>
#include <strings.h>
#include <sys/sendfile.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char *argv[]) {
    int sockfd;
    char buffer[256];
    struct sockaddr_in srv_addr;

    // Create socket
    sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (sockfd < 0) {
       perror("server: socket()");
       return -1;
    }
    bzero((char *) &srv_addr, sizeof(srv_addr));
    srv_addr.sin_family = AF_INET;
    srv_addr.sin_addr.s_addr = htonl(INADDR_ANY);
    srv_addr.sin_port = htons(8080);
    // Bind socket
    if (bind(sockfd, (struct sockaddr *) &srv_addr,
	     sizeof(srv_addr)) < 0)  {
        perror("server: bind()");
        close(sockfd);
        return -1;
    }
    // Listen for connections
    listen(sockfd,5);

    while(1) {
        int filefd = -1, newsockfd = -1;
        struct sockaddr_in cli_addr;
        socklen_t cli_len = sizeof(cli_addr);

        // Accept connection
        newsockfd = accept(sockfd, (struct sockaddr *)&cli_addr, &cli_len);
        if (newsockfd < 0) {
            perror("server: accept()");
            goto end;
        }
        // Read filename from client
        bzero(buffer, sizeof(buffer));
        ssize_t n = read(newsockfd,buffer,sizeof(buffer)-1);
        if (n < 0) {
            perror("server: read()");
            goto end;
        }
        // Open file
        filefd = open(buffer, O_RDONLY);
        if (filefd < 0) {
            perror("server: read()");
            goto end;
        }
        // Get file size
        struct stat file_stat;
        if(fstat(filefd, &file_stat) < 0) {
            perror("server: fstat()");
            goto end;
        }
        // Send file
        off_t offset = 0;
        ssize_t bytes_sent = 0, bytes_left = file_stat.st_size;
        while ((bytes_sent = sendfile(newsockfd, filefd,
				      &offset, bytes_left)) > 0) {
            bytes_left -= bytes_sent;
        }

end:
        // Close file and client socket
        if (filefd >= 0) {
                close(filefd);
        }
        if (newsockfd >= 0) {
                close(newsockfd);
        }
    }
    close(sockfd);
    return 0;
}
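
A rough way to build and start the server, assuming the listing above is
saved as server.c (the file name is illustrative; the server loops forever
serving port 8080):

    gcc -o server server.c
    ./server &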

client:

#include <arpa/inet.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(int argc, char *argv[]) {
    int sockfd, filefd;
    struct sockaddr_in sockaddr;
    char buffer[256];
    ssize_t n;

    if (argc != 3) {
        fprintf(stderr, "usage: %s <server_addr> <filename>\n", argv[0]);
        return -1;
    }
    char *server_addr = argv[1];
    char *filename = argv[2];

    if ((sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == -1) {
        perror("client: socket()");
        return -1;
    }

    memset(&sockaddr, 0, sizeof(sockaddr));
    sockaddr.sin_family = AF_INET;
    inet_pton(AF_INET, server_addr, &sockaddr.sin_addr);
    sockaddr.sin_port = htons(8080);

    int val = 65536;
    if (setsockopt(sockfd, SOL_SOCKET, SO_RCVBUF,
		   &val, sizeof(val)) < 0) {
        perror("client: setockopt(SO_RCVBUF)");
        return -1;
    }
    if (connect(sockfd, (struct sockaddr*)&sockaddr,
		sizeof(sockaddr)) == -1) {
        close(sockfd);
        perror("client: connect()");
        return -1;
    }

    // Send filename to server
    n = write(sockfd, filename, strlen(filename));
    if (n < 0) {
         perror("client: write()");
         return -1;
    }
    // Open file
    filefd = open(filename, O_WRONLY | O_CREAT, 0666);
    if(filefd < 0) {
         perror("client: open()");
         return -1;
    }
    // Read file from server
    while((n = read(sockfd, buffer, sizeof(buffer))) > 0) {
        write(filefd, buffer, n);
    }
    // Close file and socket
    close(filefd);
    close(sockfd);
    return 0;
}
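
To build and run the client against the server above (the address is
illustrative; any ~10M file on the server works as the argument):

    gcc -o client client.c
    ./client 192.0.2.10 /tmp/10M_file

Note the client pins SO_RCVBUF to 64KB before connecting, which is what
makes the window_clamp behavior described below visible.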

Before the commit, it takes around 22 seconds to transfer 10M of data.
After the commit, it takes 40 seconds. Because our application has a
30-second timeout, this regression broke the application.
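
As a rough sanity check (the RTT here is an assumption, not a measured
value): a receive-window-limited transfer moves at about window / RTT.
With a ~28k window and an RTT on the order of 100 ms, that is roughly
280 KB/s, or about 36 seconds for 10M, consistent with the ~40 seconds
observed and beyond the 30-second timeout.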

The reason that it takes longer to transfer data is that
tp->scaling_ratio is initialized to a value that results in ~0.25 of
rcvbuf. In our case, SO_RCVBUF is set to 65536 by the application, which
translates to 2 * 65536 = 131,072 bytes in rcvbuf and hence a ~28k
initial receive window.

Later, even though the scaling_ratio is updated to a more accurate
skb->len/skb->truesize, which is ~0.66 in our environment, the window
stays at ~0.25 * rcvbuf. This is because tp->window_clamp does not
change together with the tp->scaling_ratio update. As a result, the
window size is capped at the initial window_clamp, which is also ~0.25 *
rcvbuf, and never grows bigger.
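
The effect can be seen with a minimal user-space sketch that mirrors the
shape of the kernel's __tcp_win_from_space() arithmetic (the scaling_ratio
values 64 and 169 used below approximate 0.25 and 0.66 out of 256):

#include <stdint.h>
#include <stdio.h>

/* window = space * scaling_ratio >> TCP_RMEM_TO_WIN_SCALE (8) */
static int win_from_space(uint8_t scaling_ratio, int space)
{
    return (int)(((int64_t)space * scaling_ratio) >> 8);
}

int main(void)
{
    int rcvbuf = 2 * 65536; /* SO_RCVBUF 65536, doubled by the kernel */

    printf("initial clamp  (~0.25): %d\n", win_from_space(64, rcvbuf));
    printf("measured ratio (~0.66): %d\n", win_from_space(169, rcvbuf));
    return 0;
}

This prints 32768 and 86528: the usable window could more than double once
scaling_ratio is refined, but window_clamp pins it at the initial value.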

This patch updates window_clamp along with scaling_ratio. It changes the
calculation of the initial rcv_wscale as well to make sure the scale
factor is also not capped by the initial window_clamp.

A comment from Tycho Andersen <tycho@tycho.pizza> is "What happens if
someone has done setsockopt(sk, TCP_WINDOW_CLAMP) explicitly; will this
and the above not violate userspace's desire to clamp the window size?".
This comment is not addressed in this patch because the existing code
also updates window_clamp at several places without checking if
TCP_WINDOW_CLAMP is set by user space. Adding this check now may break
certain user space assumptions (similar to how the original patch broke
the assumption of buffer overhead being 50%). For example, a user
space program may set TCP_WINDOW_CLAMP while the application behavior
relies on window_clamp being adjusted by the kernel as of today.
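
For context, the explicit clamp in question is set like this (a sketch;
the 64KB value is illustrative):

    int clamp = 65536;
    setsockopt(fd, IPPROTO_TCP, TCP_WINDOW_CLAMP, &clamp, sizeof(clamp));

The open question is whether window_clamp should keep following
scaling_ratio past a value user space pinned this way.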

Fixes: dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale")
Signed-off-by: Hechao Li <hli@netflix.com>
Reviewed-by: Tycho Andersen <tycho@tycho.pizza>
---
 net/ipv4/tcp_input.c  | 6 +++++-
 net/ipv4/tcp_output.c | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

Comments

Jakub Kicinski April 3, 2024, 2:13 p.m. UTC | #1
On Tue,  2 Apr 2024 14:54:06 -0700 Hechao Li wrote:
> After commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale"),
> we noticed an application-level timeout due to reduced throughput. This
> can be reproduced by the following minimal client and server program.

Hi Hechao, nice to e-meet you :)

I suspect Eric may say that SO_RCVBUF = 64k is not very reasonable.
But I'll leave the technical review to him.

What I noticed is that our cryptic CI system appears to point at this
change as breaking BPF tests:

https://netdev.bots.linux.dev/flakes.html?min-flip=0&pw-n=0&tn-needle=gh-bpf

We accumulate all outstanding patches and test together.
BPF broke at net-next-2024-04-03--00-00, and:

$ cidiff origin/net-next-2024-04-02--21-00 \
         origin/net-next-2024-04-03--00-00

+tcp: update window_clamp together with scaling_ratio
+tools: ynl: ethtool.py: Output timestamping statistics from tsinfo-get operation
+netlink: specs: ethtool: add header-flags enumeration
+net/mlx5e: Implement ethtool hardware timestamping statistics
+net/mlx5e: Introduce timestamps statistic counter for Tx DMA layer
+net/mlx5e: Introduce lost_cqe statistic counter for PTP Tx port timestamping CQ
+ethtool: add interface to read Tx hardware timestamping statistics

The other patches are all driver stuff..

Here's the BPF CI output:

https://github.com/kernel-patches/bpf/actions/runs/8538300303
Eric Dumazet April 3, 2024, 2:22 p.m. UTC | #2
On Tue, Apr 2, 2024 at 11:56 PM Hechao Li <hli@netflix.com> wrote:
>
> After commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale"),
> we noticed an application-level timeout due to reduced throughput. This
> can be reproduced by the following minimal client and server program.
>
> server:
>
...
>
> Before the commit, it takes around 22 seconds to transfer 10M of data.
> After the commit, it takes 40 seconds. Because our application has a
> 30-second timeout, this regression broke the application.
>
> The reason that it takes longer to transfer data is that
> tp->scaling_ratio is initialized to a value that results in ~0.25 of
> rcvbuf. In our case, SO_RCVBUF is set to 65536 by the application, which
> translates to 2 * 65536 = 131,072 bytes in rcvbuf and hence a ~28k
> initial receive window.

What driver are you using, what MTU is set ?

If you get a 0.25 ratio, that is because a driver is oversizing rx skbs.

SO_RCVBUF 65536 would map indeed to 32768 bytes of payload.

>
> Later, even though the scaling_ratio is updated to a more accurate
> skb->len/skb->truesize, which is ~0.66 in our environment, the window
> stays at ~0.25 * rcvbuf. This is because tp->window_clamp does not
> change together with the tp->scaling_ratio update. As a result, the
> window size is capped at the initial window_clamp, which is also ~0.25 *
> rcvbuf, and never grows bigger.
>
> This patch updates window_clamp along with scaling_ratio. It changes the
> calculation of the initial rcv_wscale as well to make sure the scale
> factor is also not capped by the initial window_clamp.

This is very suspicious.

>
> A comment from Tycho Andersen <tycho@tycho.pizza> is "What happens if
> someone has done setsockopt(sk, TCP_WINDOW_CLAMP) explicitly; will this
> and the above not violate userspace's desire to clamp the window size?".
> This comment is not addressed in this patch because the existing code
> also updates window_clamp at several places without checking if
> TCP_WINDOW_CLAMP is set by user space. Adding this check now may break
> certain user space assumptions (similar to how the original patch broke
> the assumption of buffer overhead being 50%). For example, a user
> space program may set TCP_WINDOW_CLAMP while the application behavior
> relies on window_clamp being adjusted by the kernel as of today.

Quite frankly I would prefer we increase tcp_rmem[] sysctls, instead
of trying to accommodate too-small SO_RCVBUF values.

This would benefit old applications that were written 20 years ago.
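
For reference, raising tcp_rmem would look something like this; the values
are illustrative, not a recommendation for any particular workload:

    sysctl -w net.ipv4.tcp_rmem="4096 131072 16777216"

Only the third field, the autotuning maximum, matters for long transfers
with autotuned buffers.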
Eric Dumazet April 3, 2024, 2:49 p.m. UTC | #3
On Wed, Apr 3, 2024 at 4:22 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Tue, Apr 2, 2024 at 11:56 PM Hechao Li <hli@netflix.com> wrote:
> >
> > After commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale"),
> > we noticed an application-level timeout due to reduced throughput. This
> > can be reproduced by the following minimal client and server program.
> >
> > server:
> >
> ...
> >
> > Before the commit, it takes around 22 seconds to transfer 10M of data.
> > After the commit, it takes 40 seconds. Because our application has a
> > 30-second timeout, this regression broke the application.
> >
> > The reason that it takes longer to transfer data is that
> > tp->scaling_ratio is initialized to a value that results in ~0.25 of
> > rcvbuf. In our case, SO_RCVBUF is set to 65536 by the application, which
> > translates to 2 * 65536 = 131,072 bytes in rcvbuf and hence a ~28k
> > initial receive window.
>
> What driver are you using, what MTU is set ?
>
> If you get a 0.25 ratio, that is because a driver is oversizing rx skbs.
>
> SO_RCVBUF 65536 would map indeed to 32768 bytes of payload.
>
> >
> > Later, even though the scaling_ratio is updated to a more accurate
> > skb->len/skb->truesize, which is ~0.66 in our environment, the window
> > stays at ~0.25 * rcvbuf. This is because tp->window_clamp does not
> > change together with the tp->scaling_ratio update. As a result, the
> > window size is capped at the initial window_clamp, which is also ~0.25 *
> > rcvbuf, and never grows bigger.

Sorry I missed this part. I understand better.

I wonder if we should at least test (sk->sk_userlocks &
SOCK_RCVBUF_LOCK) or something...

For autotuned flows (majority of the cases), tp->window_clamp is
changed from tcp_rcv_space_adjust()

I think we need to audit a bit more all tp->window_clamp changes.
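
A sketch of the kind of guard being floated, applied to the hunk in
tcp_measure_rcv_mss() from the posted patch (not tested):

	/* Let window_clamp follow scaling_ratio only while the
	 * receive buffer is autotuned, i.e. user space has not
	 * locked it via SO_RCVBUF.
	 */
	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
		tp->window_clamp = tcp_full_space(sk);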

> >
> > This patch updates window_clamp along with scaling_ratio. It changes the
> > calculation of the initial rcv_wscale as well to make sure the scale
> > factor is also not capped by the initial window_clamp.
>
> This is very suspicious.
>
> >
> > A comment from Tycho Andersen <tycho@tycho.pizza> is "What happens if
> > someone has done setsockopt(sk, TCP_WINDOW_CLAMP) explicitly; will this
> > and the above not violate userspace's desire to clamp the window size?".
> > This comment is not addressed in this patch because the existing code
> > also updates window_clamp at several places without checking if
> > TCP_WINDOW_CLAMP is set by user space. Adding this check now may break
> > certain user space assumptions (similar to how the original patch broke
> > the assumption of buffer overhead being 50%). For example, a user
> > space program may set TCP_WINDOW_CLAMP while the application behavior
> > relies on window_clamp being adjusted by the kernel as of today.
>
> Quite frankly I would prefer we increase tcp_rmem[] sysctls, instead
> of trying to accommodate too-small SO_RCVBUF values.
>
> This would benefit old applications that were written 20 years ago.
Hechao Li April 3, 2024, 4:30 p.m. UTC | #4
On 24/04/03 04:49PM, Eric Dumazet wrote:
> On Wed, Apr 3, 2024 at 4:22 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Tue, Apr 2, 2024 at 11:56 PM Hechao Li <hli@netflix.com> wrote:
> > >
> > > After commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale"),
> > > we noticed an application-level timeout due to reduced throughput. This
> > > can be reproduced by the following minimal client and server program.
> > >
> > > server:
> > >
> > ...
> > >
> > > Before the commit, it takes around 22 seconds to transfer 10M of data.
> > > After the commit, it takes 40 seconds. Because our application has a
> > > 30-second timeout, this regression broke the application.
> > >
> > > The reason that it takes longer to transfer data is that
> > > tp->scaling_ratio is initialized to a value that results in ~0.25 of
> > > rcvbuf. In our case, SO_RCVBUF is set to 65536 by the application, which
> > > translates to 2 * 65536 = 131,072 bytes in rcvbuf and hence a ~28k
> > > initial receive window.
> >
> > What driver are you using, what MTU is set ?

The driver is the AWS ENA driver. This is cross-region/internet traffic, so
the MTU is 1500.

> >
> > If you get a 0.25 ratio, that is because a driver is oversizing rx skbs.
> >
> > SO_RCVBUF 65536 would map indeed to 32768 bytes of payload.
> >

The 0.25 ratio is the initial default ratio calculated using 

#define TCP_DEFAULT_SCALING_RATIO ((1200 << TCP_RMEM_TO_WIN_SCALE) / \
                                   SKB_TRUESIZE(4096))

I think this is a constant 0.25, no?

Later with skb->len/skb->truesize, we get 0.66. However, the window
can't grow to this ratio because window_clamp stays at the initial
value, which is the initial tcp_full_space(sk), which is roughly 0.25 *
rcvbuf.

> > >
> > > Later, even though the scaling_ratio is updated to a more accurate
> > > skb->len/skb->truesize, which is ~0.66 in our environment, the window
> > > stays at ~0.25 * rcvbuf. This is because tp->window_clamp does not
> > > change together with the tp->scaling_ratio update. As a result, the
> > > window size is capped at the initial window_clamp, which is also ~0.25 *
> > > rcvbuf, and never grows bigger.
> 
> Sorry I missed this part. I understand better.
> 
> I wonder if we should at least test (sk->sk_userlocks &
> SOCK_RCVBUF_LOCK) or something...

In our case, the application does set SOCK_RCVBUF_LOCK. But meanwhile,
we also want the window_clamp to grow according to the ratio, so that
the window can grow beyond the original 0.25 * rcvbuf.

> 
> For autotuned flows (majority of the cases), tp->window_clamp is
> changed from tcp_rcv_space_adjust()
> 
> I think we need to audit a bit more all tp->window_clamp changes.
> 
> > >
> > > This patch updates window_clamp along with scaling_ratio. It changes the
> > > calculation of the initial rcv_wscale as well to make sure the scale
> > > factor is also not capped by the initial window_clamp.
> >
> > This is very suspicious.
> >
> > >
> > > A comment from Tycho Andersen <tycho@tycho.pizza> is "What happens if
> > > someone has done setsockopt(sk, TCP_WINDOW_CLAMP) explicitly; will this
> > > and the above not violate userspace's desire to clamp the window size?".
> > > This comment is not addressed in this patch because the existing code
> > > also updates window_clamp at several places without checking if
> > > TCP_WINDOW_CLAMP is set by user space. Adding this check now may break
> > > certain user space assumptions (similar to how the original patch broke
> > > the assumption of buffer overhead being 50%). For example, a user
> > > space program may set TCP_WINDOW_CLAMP while the application behavior
> > > relies on window_clamp being adjusted by the kernel as of today.
> >
> > Quite frankly I would prefer we increase tcp_rmem[] sysctls, instead
> > of trying to accommodate too-small SO_RCVBUF values.
> >
> > This would benefit old applications that were written 20 years ago.

The application is Kafka and it has a default config of 64KB SO_RCVBUF
(https://docs.confluent.io/platform/current/installation/configuration/consumer-configs.html#receive-buffer-bytes),
so in this case it's limited by SO_RCVBUF and not tcp_rmem. It also has
a default request timeout of 30 seconds
(https://docs.confluent.io/platform/current/installation/configuration/consumer-configs.html#request-timeout-ms).
The combination of these two configs requires a certain amount of app
data (in our case 10M) to be transferred within 30 seconds. But a 32k
window size can't achieve this, causing the application timeout. Our goal
was to upgrade the kernel without having to update applications if possible.
Eric Dumazet April 3, 2024, 4:43 p.m. UTC | #5
On Wed, Apr 3, 2024 at 6:30 PM Hechao Li <hli@netflix.com> wrote:
>
> On 24/04/03 04:49PM, Eric Dumazet wrote:
> > On Wed, Apr 3, 2024 at 4:22 PM Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > On Tue, Apr 2, 2024 at 11:56 PM Hechao Li <hli@netflix.com> wrote:
> > > >
> > > > After commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale"),
> > > > we noticed an application-level timeout due to reduced throughput. This
> > > > can be reproduced by the following minimal client and server program.
> > > >
> > > > server:
> > > >
> > > ...
> > > >
> > > > Before the commit, it takes around 22 seconds to transfer 10M of data.
> > > > After the commit, it takes 40 seconds. Because our application has a
> > > > 30-second timeout, this regression broke the application.
> > > >
> > > > The reason that it takes longer to transfer data is that
> > > > tp->scaling_ratio is initialized to a value that results in ~0.25 of
> > > > rcvbuf. In our case, SO_RCVBUF is set to 65536 by the application, which
> > > > translates to 2 * 65536 = 131,072 bytes in rcvbuf and hence a ~28k
> > > > initial receive window.
> > >
> > > What driver are you using, what MTU is set ?
>
> The driver is AWS ENA driver. This is cross-region/internet traffic, so
> the MTU is 1500.
>
> > >
> > > If you get a 0.25 ratio, that is because a driver is oversizing rx skbs.
> > >
> > > SO_RCVBUF 65536 would map indeed to 32768 bytes of payload.
> > >
>
> The 0.25 ratio is the initial default ratio calculated using
>
> #define TCP_DEFAULT_SCALING_RATIO ((1200 << TCP_RMEM_TO_WIN_SCALE) / \
>                                    SKB_TRUESIZE(4096))
>
> I think this is a constant 0.25, no?

This depends on skb metadata size, which changes over time.

With MAX_SKB_FRAGS == 17, this is .25390625

With MAX_SKB_FRAGS == 45, this is .234375
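
For the curious, the arithmetic behind those two numbers (the truesize
values are approximate, since they depend on struct layouts): with
TCP_RMEM_TO_WIN_SCALE == 8, the macro is (1200 << 8) / SKB_TRUESIZE(4096)
= 307200 / truesize. A truesize of about 4672 (MAX_SKB_FRAGS == 17) gives
65, i.e. 65/256 = .25390625; the larger skb_shared_info of
MAX_SKB_FRAGS == 45 pushes truesize to about 5120, giving 60/256 = .234375.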


>
> Later with skb->len/skb->truesize, we get 0.66. However, the window
> can't grow to this ratio because window_clamp stays at the initial
> value, which is the initial tcp_full_space(sk), which is roughly 0.25 *
> rcvbuf.

Sure. Please address Jakub feedback about tests.
Eric Dumazet April 4, 2024, 10:58 a.m. UTC | #6
On Wed, Apr 3, 2024 at 6:43 PM Eric Dumazet <edumazet@google.com> wrote:

>
> >
> > Later with skb->len/skb->truesize, we get 0.66. However, the window
> > can't grow to this ratio because window_clamp stays at the initial
> > value, which is the initial tcp_full_space(sk), which is roughly 0.25 *
> > rcvbuf.
>
> Sure. Please address Jakub feedback about tests.

I think a less risky patch would be the following one.

If you agree, send a V2 of the patch.

Also remove from the changelog all the C code, this has no real value there.
changelog should be not too small, not too big.
If you want to capture this test code, please cook a separate patch
for net-next to add it in tools/testing/selftests/net
(But I guess iperf3 could be used instead, iperf3 is already used in
tools/testing/selftests)

Thanks !

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6ae35199d3b3c159ba029ff74b109c56a7c7d2fc..2bcf30381d75f84acf3b0276a4b348edeb7f0781 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1539,11 +1539,10 @@ static inline int tcp_space_from_win(const struct sock *sk, int win)
        return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win);
 }

-/* Assume a conservative default of 1200 bytes of payload per 4K page.
+/* Assume a 50% default for skb->len/skb->truesize ratio.
  * This may be adjusted later in tcp_measure_rcv_mss().
  */
-#define TCP_DEFAULT_SCALING_RATIO ((1200 << TCP_RMEM_TO_WIN_SCALE) / \
-                                  SKB_TRUESIZE(4096))
+#define TCP_DEFAULT_SCALING_RATIO (1 << (TCP_RMEM_TO_WIN_SCALE - 1))

 static inline void tcp_scaling_ratio_init(struct sock *sk)
 {
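
With TCP_RMEM_TO_WIN_SCALE == 8, the proposed definition evaluates to
1 << 7 = 128, i.e. 128/256 = 0.5, matching the 50% ratio in the new
comment and the effective 50% overhead of the old tcp_adv_win_scale = 1
default.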
Neal Cardwell April 9, 2024, 4:51 p.m. UTC | #7
On Wed, Apr 3, 2024 at 12:30 PM Hechao Li <hli@netflix.com> wrote:
> The application is Kafka and it has a default config of 64KB SO_RCVBUF
> (https://docs.confluent.io/platform/current/installation/configuration/consumer-configs.html#receive-buffer-bytes),
> so in this case it's limited by SO_RCVBUF and not tcp_rmem. It also has
> a default request timeout of 30 seconds
> (https://docs.confluent.io/platform/current/installation/configuration/consumer-configs.html#request-timeout-ms).
> The combination of these two configs requires a certain amount of app
> data (in our case 10M) to be transferred within 30 seconds. But a 32k
> window size can't achieve this, causing the application timeout. Our goal
> was to upgrade the kernel without having to update applications if possible.

Hechao, can you please file a bug against Kafka to get them to stop
using SO_RCVBUF, and allow receive buffer autotuning? This default
value of 64 Kbytes will cripple performance in many scenarios,
especially for WAN traffic.

I guess that would boil down to asking for the default
receive.buffer.bytes to be -1 rather than 64 Kbytes.

Looks like you can file bugs here:
  https://issues.apache.org/jira/browse/KAFKA

thanks,
neal
Hechao Li April 9, 2024, 9:59 p.m. UTC | #8
On 24/04/09 12:51PM, Neal Cardwell wrote:
> On Wed, Apr 3, 2024 at 12:30 PM Hechao Li <hli@netflix.com> wrote:
> > The application is Kafka and it has a default config of 64KB SO_RCVBUF
> > (https://docs.confluent.io/platform/current/installation/configuration/consumer-configs.html#receive-buffer-bytes),
> > so in this case it's limited by SO_RCVBUF and not tcp_rmem. It also has
> > a default request timeout of 30 seconds
> > (https://docs.confluent.io/platform/current/installation/configuration/consumer-configs.html#request-timeout-ms).
> > The combination of these two configs requires a certain amount of app
> > data (in our case 10M) to be transferred within 30 seconds. But a 32k
> > window size can't achieve this, causing the application timeout. Our goal
> > was to upgrade the kernel without having to update applications if possible.
> 
> Hechao, can you please file a bug against Kafka to get them to stop
> using SO_RCVBUF, and allow receive buffer autotuning? This default
> value of 64 Kbytes will cripple performance in many scenarios,
> especially for WAN traffic.
> 
> I guess that would boil down to asking for the default
> receive.buffer.bytes to be -1 rather than 64 Kbytes.
> 
> Looks like you can file bugs here:
>   https://issues.apache.org/jira/browse/KAFKA
> 
> thanks,
> neal

Makes sense. Filed: https://issues.apache.org/jira/browse/KAFKA-16496

I don't know why Kafka set 64k as the default in the first place. But I
was wondering if there is any use case that requires the user to set
SO_RCVBUF rather than rely on autotuning? Eventually, do we want to
deprecate support for setting SO_RCVBUF at all?

Thank you.

Hechao

Patch

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5d874817a78d..a0cfa2b910d5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -237,9 +237,13 @@  static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
 		 */
 		if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
 			u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
+			struct tcp_sock *tp = tcp_sk(sk);
 
 			do_div(val, skb->truesize);
-			tcp_sk(sk)->scaling_ratio = val ? val : 1;
+			tp->scaling_ratio = val ? val : 1;
+
+			/* Make the window_clamp follow along. */
+			tp->window_clamp = tcp_full_space(sk);
 		}
 		icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
 					       tcp_sk(sk)->advmss);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e3167ad96567..2341e3f9db58 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -239,7 +239,7 @@  void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
 		/* Set window scaling on max possible window */
 		space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
 		space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
-		space = min_t(u32, space, *window_clamp);
+		space = min_t(u32, space, sk->sk_rcvbuf);
 		*rcv_wscale = clamp_t(int, ilog2(space) - 15,
 				      0, TCP_MAX_WSCALE);
 	}
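
A rough way to observe the effect of the patch (illustrative; exact ss
filter and output syntax vary by iproute2 version): run the repro above
and, on the client, watch fields such as rcv_space and rcv_ssthresh with

    ss -tmi dst :8080

Without the patch the window stalls near 0.25 * rcvbuf; with it,
window_clamp is recomputed via tcp_full_space() whenever scaling_ratio is
refined, so the window can grow toward the measured ~0.66 ratio.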