From patchwork Fri Apr 19 22:50:43 2013
X-Patchwork-Submitter: Alex Elder
X-Patchwork-Id: 2467461
Message-ID: <5171CA43.5070200@inktank.com>
Date: Fri, 19 Apr 2013 17:50:43 -0500
From: Alex Elder
To: ceph-devel
Subject: [PATCH 1/2] rbd: implement full object parent reads
References: <5171C963.2050402@inktank.com>
In-Reply-To: <5171C963.2050402@inktank.com>
X-Mailing-List: ceph-devel@vger.kernel.org

As a step toward implementing layered writes, implement reading the
data for a target object from the parent image for a write request
whose target object is known to not exist.  Add a copyup_pages field
to an image request to track the page array used (only) for such a
request.

Signed-off-by: Alex Elder
Reviewed-by: Josh Durgin
---
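A note on sizing before the diff (an illustration, not part of the
patch): the parent read always covers the entire backing object of the
write target, so the page array is sized from the image's object order
rather than from the triggering write.  The standalone userspace sketch
below restates the arithmetic rbd_img_obj_parent_read_full() performs;
the request values are made up, and calc_pages_for() here simply
mirrors the behavior of the libceph helper of the same name.

/*
 * Illustration only -- a userspace restatement of the sizing
 * arithmetic the patch relies on, not kernel code.  The request
 * values below are hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096ULL

/* Pages needed to hold len bytes starting at offset ofs in a page. */
static uint32_t calc_pages_for(uint64_t ofs, uint64_t len)
{
	return (ofs + len + PAGE_SIZE - 1) / PAGE_SIZE - ofs / PAGE_SIZE;
}

int main(void)
{
	uint8_t obj_order = 22;			/* default: 4 MB objects */
	uint64_t obj_size = 1ULL << obj_order;

	/* Hypothetical 8 KB write landing 6 MB into the child image. */
	uint64_t img_offset = 6 * 1024 * 1024;	/* obj_request->img_offset */
	uint64_t obj_offset = img_offset & (obj_size - 1); /* ->offset */

	/* As in the patch: back up to the start of the backing object:
	 * img_offset = obj_request->img_offset - obj_request->offset */
	uint64_t read_offset = img_offset - obj_offset;

	printf("parent read: offset=%llu length=%llu pages=%u\n",
	       (unsigned long long)read_offset,
	       (unsigned long long)obj_size,
	       calc_pages_for(0, obj_size));	/* -> 1024 pages */
	return 0;
}

With the default obj_order of 22 (4 MB objects) every such parent read
allocates 1024 pages, no matter how small the write that triggered it.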
 drivers/block/rbd.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 143 insertions(+), 9 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 91fcf36..c5d0619 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -250,6 +250,7 @@ struct rbd_img_request {
 		struct request		*rq;		/* block request */
 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
 	};
+	struct page		**copyup_pages;
 	spinlock_t	completion_lock;/* protects next_completion */
 	u32			next_completion;
 	rbd_img_callback_t	callback;
@@ -350,6 +351,8 @@ static DEFINE_SPINLOCK(rbd_dev_list_lock);
 static LIST_HEAD(rbd_client_list);		/* clients */
 static DEFINE_SPINLOCK(rbd_client_list_lock);
 
+static int rbd_img_request_submit(struct rbd_img_request *img_request);
+
 static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
 
@@ -1956,6 +1959,133 @@ out_unwind:
 	return -ENOMEM;
 }
 
+static void
+rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
+{
+	struct rbd_obj_request *orig_request;
+	struct page **pages;
+	u32 page_count;
+	int result;
+	u64 obj_size;
+	u64 xferred;
+
+	rbd_assert(img_request_child_test(img_request));
+
+	/* First get what we need from the image request */
+
+	pages = img_request->copyup_pages;
+	rbd_assert(pages != NULL);
+	img_request->copyup_pages = NULL;
+
+	orig_request = img_request->obj_request;
+	rbd_assert(orig_request != NULL);
+
+	result = img_request->result;
+	obj_size = img_request->length;
+	xferred = img_request->xferred;
+
+	rbd_img_request_put(img_request);
+
+	obj_request_existence_set(orig_request, true);
+
+	page_count = (u32)calc_pages_for(0, obj_size);
+	ceph_release_page_vector(pages, page_count);
+
+	/* Resubmit the original request (for now). */
+
+	orig_request->result = rbd_img_obj_request_submit(orig_request);
+	if (orig_request->result) {
+		obj_request_done_set(orig_request);
+		rbd_obj_request_complete(orig_request);
+	}
+}
+
+/*
+ * Read from the parent image the range of data that covers the
+ * entire target of the given object request.  This is used for
+ * satisfying a layered image write request when the target of an
+ * object request from the image request does not exist.
+ *
+ * A page array big enough to hold the returned data is allocated
+ * and supplied to rbd_img_request_fill() as the "data descriptor."
+ * When the read completes, this page array will be transferred to
+ * the original object request for the copyup operation.
+ *
+ * If an error occurs, record it as the result of the original
+ * object request and mark it done so it gets completed.
+ */
+static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
+{
+	struct rbd_img_request *img_request = NULL;
+	struct rbd_img_request *parent_request = NULL;
+	struct rbd_device *rbd_dev;
+	u64 img_offset;
+	u64 length;
+	struct page **pages = NULL;
+	u32 page_count;
+	int result;
+
+	rbd_assert(obj_request_img_data_test(obj_request));
+	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
+
+	img_request = obj_request->img_request;
+	rbd_assert(img_request != NULL);
+	rbd_dev = img_request->rbd_dev;
+	rbd_assert(rbd_dev->parent != NULL);
+
+	/*
+	 * Determine the byte range covered by the object in the
+	 * child image to which the original request was to be sent.
+	 */
+	img_offset = obj_request->img_offset - obj_request->offset;
+	length = (u64)1 << rbd_dev->header.obj_order;
+
+	/*
+	 * Allocate a page array big enough to receive the data read
+	 * from the parent.
+	 */
+	page_count = (u32)calc_pages_for(0, length);
+	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
+	if (IS_ERR(pages)) {
+		result = PTR_ERR(pages);
+		pages = NULL;
+		goto out_err;
+	}
+
+	result = -ENOMEM;
+	parent_request = rbd_img_request_create(rbd_dev->parent,
+						img_offset, length,
+						false, true);
+	if (!parent_request)
+		goto out_err;
+	rbd_obj_request_get(obj_request);
+	parent_request->obj_request = obj_request;
+
+	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
+	if (result)
+		goto out_err;
+	parent_request->copyup_pages = pages;
+
+	parent_request->callback = rbd_img_obj_parent_read_full_callback;
+	result = rbd_img_request_submit(parent_request);
+	if (!result)
+		return 0;
+
+	parent_request->copyup_pages = NULL;
+	parent_request->obj_request = NULL;
+	rbd_obj_request_put(obj_request);
+out_err:
+	if (pages)
+		ceph_release_page_vector(pages, page_count);
+	if (parent_request)
+		rbd_img_request_put(parent_request);
+	obj_request->result = result;
+	obj_request->xferred = 0;
+	obj_request_done_set(obj_request);
+
+	return result;
+}
+
 static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
 {
 	struct rbd_obj_request *orig_request;
@@ -1996,7 +2126,7 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
 		obj_request_existence_set(orig_request, false);
 	} else if (result) {
 		orig_request->result = result;
-		goto out_err;
+		goto out;
 	}
 
 	/*
@@ -2004,7 +2134,7 @@ static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
 	 * whether the target object exists.
 	 */
 	orig_request->result = rbd_img_obj_request_submit(orig_request);
-out_err:
+out:
 	if (orig_request->result)
 		rbd_obj_request_complete(orig_request);
 	rbd_obj_request_put(orig_request);
@@ -2070,15 +2200,13 @@ out:
 static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
 {
 	struct rbd_img_request *img_request;
+	bool known;
 
 	rbd_assert(obj_request_img_data_test(obj_request));
 	img_request = obj_request->img_request;
 	rbd_assert(img_request);
 
-	/* (At the moment we don't care whether it exists or not...) */
-	(void) obj_request_exists_test;
-
 	/*
 	 * Only layered writes need special handling.  If it's not a
 	 * layered write, or it is a layered write but we know the
@@ -2087,7 +2215,8 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
 	 */
 	if (!img_request_write_test(img_request) ||
 	    !img_request_layered_test(img_request) ||
-	    obj_request_known_test(obj_request)) {
+	    ((known = obj_request_known_test(obj_request)) &&
+	     obj_request_exists_test(obj_request))) {
 		struct rbd_device *rbd_dev;
 		struct ceph_osd_client *osdc;
 
@@ -2099,10 +2228,15 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
 	}
 
 	/*
-	 * It's a layered write and we don't know whether the target
-	 * exists.  Issue existence check; once that completes the
-	 * original request will be submitted again.
+	 * It's a layered write.  The target object might exist but
+	 * we may not know that yet.  If we know it doesn't exist,
+	 * start by reading the data for the full target object from
+	 * the parent so we can use it for a copyup to the target.
 	 */
+	if (known)
+		return rbd_img_obj_parent_read_full(obj_request);
+
+	/* We don't know whether the target exists.  Go find out. */
 	return rbd_img_obj_exists_submit(obj_request);
 }
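The decision this patch leaves in rbd_img_obj_request_submit() has
three outcomes.  The sketch below is a userspace mock of that dispatch
(the predicate values are plain inputs here; in the kernel they come
from image and object request flags), not the kernel code itself:

/*
 * Illustration only -- a userspace mock of the three-way dispatch
 * this patch creates in rbd_img_obj_request_submit().
 */
#include <stdio.h>
#include <stdbool.h>

enum action { SUBMIT_TO_OSD, PARENT_READ_FULL, EXISTS_CHECK };

static enum action dispatch(bool is_write, bool is_layered,
			     bool known, bool exists)
{
	/* Not a layered write, or target known to exist: submit as-is. */
	if (!is_write || !is_layered || (known && exists))
		return SUBMIT_TO_OSD;

	/* Layered write, target known *not* to exist: read the full
	 * backing object from the parent first (the copyup path). */
	if (known)
		return PARENT_READ_FULL;

	/* Existence unknown: issue the stat; its callback resubmits. */
	return EXISTS_CHECK;
}

int main(void)
{
	printf("%d\n", dispatch(true, true, true, false));   /* 1: parent read */
	printf("%d\n", dispatch(true, true, false, false));  /* 2: exists check */
	printf("%d\n", dispatch(true, false, false, false)); /* 0: direct submit */
	return 0;
}

Note the change to the fast-path test: "known" alone is no longer
sufficient, the target must be known *and* exist.  A target known not
to exist now goes straight to the parent read, and only the unknown
case still pays for the existence-check round trip before the original
request is resubmitted.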