diff mbox series

[3/4] libceph: use MSG_SENDPAGE_NOTLAST with ceph_tcp_sendpage()

Message ID 20181123124020.4637-4-idryomov@gmail.com (mailing list archive)
State New, archived
Headers show
Series libceph: fix ceph_tcp_sendpage() more hinting | expand

Commit Message

Ilya Dryomov Nov. 23, 2018, 12:40 p.m. UTC
Prevent do_tcp_sendpages() from calling tcp_push() (at least) once per
page.  Instead, arrange for tcp_push() to be called (at least) once per
data payload.  This results in more MSS-sized packets and fewer packets
overall (5-10% reduction in my tests with typical OSD request sizes).
See commits 2f5338442425 ("tcp: allow splice() to build full TSO
packets"), 35f9c09fe9c7 ("tcp: tcp_sendpages() should call tcp_push()
once") and ae62ca7b0321 ("tcp: fix MSG_SENDPAGE_NOTLAST logic") for
details.

Here is an example of a packet size histogram for 128K OSD requests
(MSS = 1448, top 5):

Before:

     SIZE    COUNT
     1448   777700
      952   127915
     1200    39238
     1219     9806
       21     5675

After:

     SIZE    COUNT
     1448   897280
       21     6201
     1019     2797
      643     2739
      376     2479

We could do slightly better by explicitly corking the socket but it's
not clear it's worth it.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/messenger.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)
diff mbox series

Patch

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 21a743a3bd29..649faa626b35 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -560,12 +560,15 @@  static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
 	return r;
 }
 
+/*
+ * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
+ */
 static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
-		     int offset, size_t size, bool more)
+			     int offset, size_t size, int more)
 {
 	ssize_t (*sendpage)(struct socket *sock, struct page *page,
 			    int offset, size_t size, int flags);
-	int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : 0);
+	int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
 	int ret;
 
 	/*
@@ -1552,6 +1555,7 @@  static int write_partial_message_data(struct ceph_connection *con)
 	struct ceph_msg *msg = con->out_msg;
 	struct ceph_msg_data_cursor *cursor = &msg->cursor;
 	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+	int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
 	u32 crc;
 
 	dout("%s %p msg %p\n", __func__, con, msg);
@@ -1580,8 +1584,10 @@  static int write_partial_message_data(struct ceph_connection *con)
 		}
 
 		page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
+		if (length == cursor->total_resid)
+			more = MSG_MORE;
 		ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
-					true);
+					more);
 		if (ret <= 0) {
 			if (do_datacrc)
 				msg->footer.data_crc = cpu_to_le32(crc);
@@ -1611,13 +1617,16 @@  static int write_partial_message_data(struct ceph_connection *con)
  */
 static int write_partial_skip(struct ceph_connection *con)
 {
+	int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
 	int ret;
 
 	dout("%s %p %d left\n", __func__, con, con->out_skip);
 	while (con->out_skip > 0) {
 		size_t size = min(con->out_skip, (int) PAGE_SIZE);
 
-		ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true);
+		if (size == con->out_skip)
+			more = MSG_MORE;
+		ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more);
 		if (ret <= 0)
 			goto out;
 		con->out_skip -= ret;