diff mbox

[4/4] libceph: enable large, variable-sized OSD requests

Message ID 1455103447-23559-5-git-send-email-idryomov@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ilya Dryomov Feb. 10, 2016, 11:24 a.m. UTC
Turn r_ops into a flexible array member to enable large, consisting of
up to 16 ops, OSD requests.  The use case is scattered writeback in
cephfs and, as far as the kernel client is concerned, 16 is just a made
up number.

r_ops had size 3 for copyup+hint+write, but copyup is really a special
case - it can only happen once.  ceph_osd_request_cache is therefore
stuffed with num_ops=2 requests, anything bigger than that is allocated
with kmalloc().  req_mempool is backed by ceph_osd_request_cache, which
means either num_ops=1 or num_ops=2 for use_mempool=true - all existing
users (ceph_writepages_start(), ceph_osdc_writepages()) are fine with
that.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c             |  2 --
 include/linux/ceph/osd_client.h |  6 ++++--
 net/ceph/osd_client.c           | 32 +++++++++++++++++++-------------
 3 files changed, 23 insertions(+), 17 deletions(-)

Comments

Yan, Zheng Feb. 10, 2016, 11:56 a.m. UTC | #1
> On Feb 10, 2016, at 19:24, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> Turn r_ops into a flexible array member to enable large, consisting of
> up to 16 ops, OSD requests.  The use case is scattered writeback in
> cephfs and, as far as the kernel client is concerned, 16 is just a made
> up number.
> 
> r_ops had size 3 for copyup+hint+write, but copyup is really a special
> case - it can only happen once.  ceph_osd_request_cache is therefore
> stuffed with num_ops=2 requests, anything bigger than that is allocated
> with kmalloc().  req_mempool is backed by ceph_osd_request_cache, which
> means either num_ops=1 or num_ops=2 for use_mempool=true - all existing
> users (ceph_writepages_start(), ceph_osdc_writepages()) are fine with
> that.
> 
> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
> ---
> drivers/block/rbd.c             |  2 --
> include/linux/ceph/osd_client.h |  6 ++++--
> net/ceph/osd_client.c           | 32 +++++++++++++++++++-------------
> 3 files changed, 23 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index 94f31bde73e8..08018710f363 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -1847,8 +1847,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
> 	if (osd_req->r_result < 0)
> 		obj_request->result = osd_req->r_result;
> 
> -	rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
> -
> 	/*
> 	 * We support a 64-bit length, but ultimately it has to be
> 	 * passed to the block layer, which just supports a 32-bit
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index c6d1d603bacf..aada6a1383a4 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -43,7 +43,8 @@ struct ceph_osd {
> };
> 
> 
> -#define CEPH_OSD_MAX_OP	3
> +#define CEPH_OSD_SLAB_OPS	2
> +#define CEPH_OSD_MAX_OPS	16
> 
> enum ceph_osd_data_type {
> 	CEPH_OSD_DATA_TYPE_NONE = 0,
> @@ -139,7 +140,6 @@ struct ceph_osd_request {
> 
> 	/* request osd ops array  */
> 	unsigned int		r_num_ops;
> -	struct ceph_osd_req_op	r_ops[CEPH_OSD_MAX_OP];
> 
> 	/* these are updated on each send */
> 	__le32           *r_request_osdmap_epoch;
> @@ -175,6 +175,8 @@ struct ceph_osd_request {
> 	unsigned long     r_stamp;            /* send OR check time */
> 
> 	struct ceph_snap_context *r_snapc;    /* snap context for writes */
> +
> +	struct ceph_osd_req_op r_ops[];
> };
> 
> struct ceph_request_redirect {
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 8bf4f74064e5..37a0fc5273d1 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -338,9 +338,10 @@ static void ceph_osdc_release_request(struct kref *kref)
> 	ceph_put_snap_context(req->r_snapc);
> 	if (req->r_mempool)
> 		mempool_free(req, req->r_osdc->req_mempool);
> -	else
> +	else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
> 		kmem_cache_free(ceph_osd_request_cache, req);
> -
> +	else
> +		kfree(req);
> }
> 
> void ceph_osdc_get_request(struct ceph_osd_request *req)
> @@ -369,9 +370,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
> 	struct ceph_msg *msg;
> 	size_t msg_size;
> 
> -	BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX);
> -	BUG_ON(num_ops > CEPH_OSD_MAX_OP);
> -
> 	msg_size = 4 + 4 + 8 + 8 + 4+8;
> 	msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
> 	msg_size += 1 + 8 + 4 + 4;     /* pg_t */
> @@ -383,14 +381,21 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
> 	msg_size += 4;
> 
> 	if (use_mempool) {
> +		BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
> 		req = mempool_alloc(osdc->req_mempool, gfp_flags);
> -		memset(req, 0, sizeof(*req));
> +	} else if (num_ops <= CEPH_OSD_SLAB_OPS) {
> +		req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
> 	} else {
> -		req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags);
> +		BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
> +		req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]),
> +			      gfp_flags);
> 	}
> -	if (req == NULL)
> +	if (unlikely(!req))
> 		return NULL;
> 
> +	/* req only, each op is zeroed in _osd_req_op_init() */
> +	memset(req, 0, sizeof(*req));
> +
> 	req->r_osdc = osdc;
> 	req->r_mempool = use_mempool;
> 	req->r_num_ops = num_ops;
> @@ -1809,7 +1814,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
> 
> 	ceph_decode_need(&p, end, 4, bad_put);
> 	numops = ceph_decode_32(&p);
> -	if (numops > CEPH_OSD_MAX_OP)
> +	if (numops > CEPH_OSD_MAX_OPS)
> 		goto bad_put;
> 	if (numops != req->r_num_ops)
> 		goto bad_put;
> @@ -2773,11 +2778,12 @@ EXPORT_SYMBOL(ceph_osdc_writepages);
> 
> int ceph_osdc_setup(void)
> {
> +	size_t size = sizeof(struct ceph_osd_request) +
> +	    CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
> +
> 	BUG_ON(ceph_osd_request_cache);
> -	ceph_osd_request_cache = kmem_cache_create("ceph_osd_request",
> -					sizeof (struct ceph_osd_request),
> -					__alignof__(struct ceph_osd_request),
> -					0, NULL);
> +	ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
> +						   0, 0, NULL);
> 
> 	return ceph_osd_request_cache ? 0 : -ENOMEM;
> }

This approach is better than mine. But the code that calculates r_request/r_reply messages size get lost in your patch

Regards
Yan, Zheng

> -- 
> 2.4.3
> 

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ilya Dryomov Feb. 11, 2016, 9:57 a.m. UTC | #2
On Wed, Feb 10, 2016 at 12:56 PM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Feb 10, 2016, at 19:24, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> Turn r_ops into a flexible array member to enable large, consisting of
>> up to 16 ops, OSD requests.  The use case is scattered writeback in
>> cephfs and, as far as the kernel client is concerned, 16 is just a made
>> up number.
>>
>> r_ops had size 3 for copyup+hint+write, but copyup is really a special
>> case - it can only happen once.  ceph_osd_request_cache is therefore
>> stuffed with num_ops=2 requests, anything bigger than that is allocated
>> with kmalloc().  req_mempool is backed by ceph_osd_request_cache, which
>> means either num_ops=1 or num_ops=2 for use_mempool=true - all existing
>> users (ceph_writepages_start(), ceph_osdc_writepages()) are fine with
>> that.
>>
>> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
>> ---
>> drivers/block/rbd.c             |  2 --
>> include/linux/ceph/osd_client.h |  6 ++++--
>> net/ceph/osd_client.c           | 32 +++++++++++++++++++-------------
>> 3 files changed, 23 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>> index 94f31bde73e8..08018710f363 100644
>> --- a/drivers/block/rbd.c
>> +++ b/drivers/block/rbd.c
>> @@ -1847,8 +1847,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
>>       if (osd_req->r_result < 0)
>>               obj_request->result = osd_req->r_result;
>>
>> -     rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
>> -
>>       /*
>>        * We support a 64-bit length, but ultimately it has to be
>>        * passed to the block layer, which just supports a 32-bit
>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
>> index c6d1d603bacf..aada6a1383a4 100644
>> --- a/include/linux/ceph/osd_client.h
>> +++ b/include/linux/ceph/osd_client.h
>> @@ -43,7 +43,8 @@ struct ceph_osd {
>> };
>>
>>
>> -#define CEPH_OSD_MAX_OP      3
>> +#define CEPH_OSD_SLAB_OPS    2
>> +#define CEPH_OSD_MAX_OPS     16
>>
>> enum ceph_osd_data_type {
>>       CEPH_OSD_DATA_TYPE_NONE = 0,
>> @@ -139,7 +140,6 @@ struct ceph_osd_request {
>>
>>       /* request osd ops array  */
>>       unsigned int            r_num_ops;
>> -     struct ceph_osd_req_op  r_ops[CEPH_OSD_MAX_OP];
>>
>>       /* these are updated on each send */
>>       __le32           *r_request_osdmap_epoch;
>> @@ -175,6 +175,8 @@ struct ceph_osd_request {
>>       unsigned long     r_stamp;            /* send OR check time */
>>
>>       struct ceph_snap_context *r_snapc;    /* snap context for writes */
>> +
>> +     struct ceph_osd_req_op r_ops[];
>> };
>>
>> struct ceph_request_redirect {
>> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>> index 8bf4f74064e5..37a0fc5273d1 100644
>> --- a/net/ceph/osd_client.c
>> +++ b/net/ceph/osd_client.c
>> @@ -338,9 +338,10 @@ static void ceph_osdc_release_request(struct kref *kref)
>>       ceph_put_snap_context(req->r_snapc);
>>       if (req->r_mempool)
>>               mempool_free(req, req->r_osdc->req_mempool);
>> -     else
>> +     else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
>>               kmem_cache_free(ceph_osd_request_cache, req);
>> -
>> +     else
>> +             kfree(req);
>> }
>>
>> void ceph_osdc_get_request(struct ceph_osd_request *req)
>> @@ -369,9 +370,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>>       struct ceph_msg *msg;
>>       size_t msg_size;
>>
>> -     BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX);
>> -     BUG_ON(num_ops > CEPH_OSD_MAX_OP);
>> -
>>       msg_size = 4 + 4 + 8 + 8 + 4+8;
>>       msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
>>       msg_size += 1 + 8 + 4 + 4;     /* pg_t */
>> @@ -383,14 +381,21 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>>       msg_size += 4;
>>
>>       if (use_mempool) {
>> +             BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
>>               req = mempool_alloc(osdc->req_mempool, gfp_flags);
>> -             memset(req, 0, sizeof(*req));
>> +     } else if (num_ops <= CEPH_OSD_SLAB_OPS) {
>> +             req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
>>       } else {
>> -             req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags);
>> +             BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
>> +             req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]),
>> +                           gfp_flags);
>>       }
>> -     if (req == NULL)
>> +     if (unlikely(!req))
>>               return NULL;
>>
>> +     /* req only, each op is zeroed in _osd_req_op_init() */
>> +     memset(req, 0, sizeof(*req));
>> +
>>       req->r_osdc = osdc;
>>       req->r_mempool = use_mempool;
>>       req->r_num_ops = num_ops;
>> @@ -1809,7 +1814,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
>>
>>       ceph_decode_need(&p, end, 4, bad_put);
>>       numops = ceph_decode_32(&p);
>> -     if (numops > CEPH_OSD_MAX_OP)
>> +     if (numops > CEPH_OSD_MAX_OPS)
>>               goto bad_put;
>>       if (numops != req->r_num_ops)
>>               goto bad_put;
>> @@ -2773,11 +2778,12 @@ EXPORT_SYMBOL(ceph_osdc_writepages);
>>
>> int ceph_osdc_setup(void)
>> {
>> +     size_t size = sizeof(struct ceph_osd_request) +
>> +         CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
>> +
>>       BUG_ON(ceph_osd_request_cache);
>> -     ceph_osd_request_cache = kmem_cache_create("ceph_osd_request",
>> -                                     sizeof (struct ceph_osd_request),
>> -                                     __alignof__(struct ceph_osd_request),
>> -                                     0, NULL);
>> +     ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
>> +                                                0, 0, NULL);
>>
>>       return ceph_osd_request_cache ? 0 : -ENOMEM;
>> }
>
> This approach is better than mine. But the code that calculates r_request/r_reply messages size get lost in your patch

r_request calculation is there, r_reply was missing because I wanted to
take another look at both r_request and r_reply - it can be a separate
commit though.  I'll pull wip-alloc-request into testing later today.

Thanks,

                Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 94f31bde73e8..08018710f363 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1847,8 +1847,6 @@  static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 	if (osd_req->r_result < 0)
 		obj_request->result = osd_req->r_result;
 
-	rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
-
 	/*
 	 * We support a 64-bit length, but ultimately it has to be
 	 * passed to the block layer, which just supports a 32-bit
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index c6d1d603bacf..aada6a1383a4 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -43,7 +43,8 @@  struct ceph_osd {
 };
 
 
-#define CEPH_OSD_MAX_OP	3
+#define CEPH_OSD_SLAB_OPS	2
+#define CEPH_OSD_MAX_OPS	16
 
 enum ceph_osd_data_type {
 	CEPH_OSD_DATA_TYPE_NONE = 0,
@@ -139,7 +140,6 @@  struct ceph_osd_request {
 
 	/* request osd ops array  */
 	unsigned int		r_num_ops;
-	struct ceph_osd_req_op	r_ops[CEPH_OSD_MAX_OP];
 
 	/* these are updated on each send */
 	__le32           *r_request_osdmap_epoch;
@@ -175,6 +175,8 @@  struct ceph_osd_request {
 	unsigned long     r_stamp;            /* send OR check time */
 
 	struct ceph_snap_context *r_snapc;    /* snap context for writes */
+
+	struct ceph_osd_req_op r_ops[];
 };
 
 struct ceph_request_redirect {
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 8bf4f74064e5..37a0fc5273d1 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -338,9 +338,10 @@  static void ceph_osdc_release_request(struct kref *kref)
 	ceph_put_snap_context(req->r_snapc);
 	if (req->r_mempool)
 		mempool_free(req, req->r_osdc->req_mempool);
-	else
+	else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
 		kmem_cache_free(ceph_osd_request_cache, req);
-
+	else
+		kfree(req);
 }
 
 void ceph_osdc_get_request(struct ceph_osd_request *req)
@@ -369,9 +370,6 @@  struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	struct ceph_msg *msg;
 	size_t msg_size;
 
-	BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX);
-	BUG_ON(num_ops > CEPH_OSD_MAX_OP);
-
 	msg_size = 4 + 4 + 8 + 8 + 4+8;
 	msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
 	msg_size += 1 + 8 + 4 + 4;     /* pg_t */
@@ -383,14 +381,21 @@  struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	msg_size += 4;
 
 	if (use_mempool) {
+		BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
 		req = mempool_alloc(osdc->req_mempool, gfp_flags);
-		memset(req, 0, sizeof(*req));
+	} else if (num_ops <= CEPH_OSD_SLAB_OPS) {
+		req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
 	} else {
-		req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags);
+		BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
+		req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]),
+			      gfp_flags);
 	}
-	if (req == NULL)
+	if (unlikely(!req))
 		return NULL;
 
+	/* req only, each op is zeroed in _osd_req_op_init() */
+	memset(req, 0, sizeof(*req));
+
 	req->r_osdc = osdc;
 	req->r_mempool = use_mempool;
 	req->r_num_ops = num_ops;
@@ -1809,7 +1814,7 @@  static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 
 	ceph_decode_need(&p, end, 4, bad_put);
 	numops = ceph_decode_32(&p);
-	if (numops > CEPH_OSD_MAX_OP)
+	if (numops > CEPH_OSD_MAX_OPS)
 		goto bad_put;
 	if (numops != req->r_num_ops)
 		goto bad_put;
@@ -2773,11 +2778,12 @@  EXPORT_SYMBOL(ceph_osdc_writepages);
 
 int ceph_osdc_setup(void)
 {
+	size_t size = sizeof(struct ceph_osd_request) +
+	    CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
+
 	BUG_ON(ceph_osd_request_cache);
-	ceph_osd_request_cache = kmem_cache_create("ceph_osd_request",
-					sizeof (struct ceph_osd_request),
-					__alignof__(struct ceph_osd_request),
-					0, NULL);
+	ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
+						   0, 0, NULL);
 
 	return ceph_osd_request_cache ? 0 : -ENOMEM;
 }