diff mbox

rbd: handle parent_overlap on writes correctly

Message ID 1402504849-9958-1-git-send-email-ilya.dryomov@inktank.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ilya Dryomov June 11, 2014, 4:40 p.m. UTC
The following check in rbd_img_obj_request_submit()

    rbd_dev->parent_overlap <= obj_request->img_offset

allows the fall through to the non-layered write case even if both
parent_overlap and obj_request->img_offset belong to the same RADOS
object.  This leads to data corruption, because the area to the left of
parent_overlap ends up unconditionally zero-filled instead of being
populated with parent data.  Suppose we want to write 1M to offset 6M
of image bar, which is a clone of foo@snap; object_size is 4M,
parent_overlap is 5M:

    rbd_data.<id>.0000000000000001
     ---------------------|----------------------|------------
    | should be copyup'ed | should be zeroed out | write ...
     ---------------------|----------------------|------------
   4M                    5M                     6M
                    parent_overlap    obj_request->img_offset

4..5M should be copyup'ed from foo, yet it is zero-filled, just like
5..6M is.

Given that the only striping mode kernel client currently supports is
chunking (i.e. stripe_unit == object_size, stripe_count == 1), round
parent_overlap up to the next object boundary for the purposes of the
overlap check.

Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
---
 drivers/block/rbd.c |   10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

Comments

Josh Durgin June 13, 2014, 1:26 a.m. UTC | #1
On 06/11/2014 09:40 AM, Ilya Dryomov wrote:
> The following check in rbd_img_obj_request_submit()
>
>      rbd_dev->parent_overlap <= obj_request->img_offset
>
> allows the fall through to the non-layered write case even if both
> parent_overlap and obj_request->img_offset belong to the same RADOS
> object.  This leads to data corruption, because the area to the left of
> parent_overlap ends up unconditionally zero-filled instead of being
> populated with parent data.  Suppose we want to write 1M to offset 6M
> of image bar, which is a clone of foo@snap; object_size is 4M,
> parent_overlap is 5M:
>
>      rbd_data.<id>.0000000000000001
>       ---------------------|----------------------|------------
>      | should be copyup'ed | should be zeroed out | write ...
>       ---------------------|----------------------|------------
>     4M                    5M                     6M
>                      parent_overlap    obj_request->img_offset
>
> 4..5M should be copyup'ed from foo, yet it is zero-filled, just like
> 5..6M is.
>
> Given that the only striping mode kernel client currently supports is
> chunking (i.e. stripe_unit == object_size, stripe_count == 1), round
> parent_overlap up to the next object boundary for the purposes of the
> overlap check.
>
> Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
> ---

Good catch! This should be included in any stable kernels 3.10 or later
too.

Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

>   drivers/block/rbd.c |   10 +++++++++-
>   1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index 8295b3afa8e0..813e673d49df 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -1366,6 +1366,14 @@ static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
>   	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
>   }
>
> +static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
> +{
> +	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
> +
> +	return obj_request->img_offset <
> +	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
> +}
> +
>   static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
>   {
>   	dout("%s: obj %p (was %d)\n", __func__, obj_request,
> @@ -2683,7 +2691,7 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
>   	 */
>   	if (!img_request_write_test(img_request) ||
>   		!img_request_layered_test(img_request) ||
> -		rbd_dev->parent_overlap <= obj_request->img_offset ||
> +		!obj_request_overlaps_parent(obj_request) ||
>   		((known = obj_request_known_test(obj_request)) &&
>   			obj_request_exists_test(obj_request))) {
>
>

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ilya Dryomov June 13, 2014, 8:06 a.m. UTC | #2
On Fri, Jun 13, 2014 at 5:26 AM, Josh Durgin <josh.durgin@inktank.com> wrote:
> On 06/11/2014 09:40 AM, Ilya Dryomov wrote:
>>
>> The following check in rbd_img_obj_request_submit()
>>
>>      rbd_dev->parent_overlap <= obj_request->img_offset
>>
>> allows the fall through to the non-layered write case even if both
>> parent_overlap and obj_request->img_offset belong to the same RADOS
>> object.  This leads to data corruption, because the area to the left of
>> parent_overlap ends up unconditionally zero-filled instead of being
>> populated with parent data.  Suppose we want to write 1M to offset 6M
>> of image bar, which is a clone of foo@snap; object_size is 4M,
>> parent_overlap is 5M:
>>
>>      rbd_data.<id>.0000000000000001
>>       ---------------------|----------------------|------------
>>      | should be copyup'ed | should be zeroed out | write ...
>>       ---------------------|----------------------|------------
>>     4M                    5M                     6M
>>                      parent_overlap    obj_request->img_offset
>>
>> 4..5M should be copyup'ed from foo, yet it is zero-filled, just like
>> 5..6M is.
>>
>> Given that the only striping mode kernel client currently supports is
>> chunking (i.e. stripe_unit == object_size, stripe_count == 1), round
>> parent_overlap up to the next object boundary for the purposes of the
>> overlap check.
>>
>> Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
>> ---
>
>
> Good catch! This should be included in any stable kernels 3.10 or later
> too.
>
> Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

This commit in ceph-client already has the 3.10+ stable tag.

Thanks,

                Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Elder June 30, 2014, 12:14 p.m. UTC | #3
On 06/11/2014 11:40 AM, Ilya Dryomov wrote:
> The following check in rbd_img_obj_request_submit()
> 
>     rbd_dev->parent_overlap <= obj_request->img_offset
> 
> allows the fall through to the non-layered write case even if both
> parent_overlap and obj_request->img_offset belong to the same RADOS
> object.  This leads to data corruption, because the area to the left of
> parent_overlap ends up unconditionally zero-filled instead of being
> populated with parent data.  Suppose we want to write 1M to offset 6M
> of image bar, which is a clone of foo@snap; object_size is 4M,
> parent_overlap is 5M:
> 
>     rbd_data.<id>.0000000000000001
>      ---------------------|----------------------|------------
>     | should be copyup'ed | should be zeroed out | write ...
>      ---------------------|----------------------|------------
>    4M                    5M                     6M
>                     parent_overlap    obj_request->img_offset
> 
> 4..5M should be copyup'ed from foo, yet it is zero-filled, just like
> 5..6M is.
> 
> Given that the only striping mode kernel client currently supports is
> chunking (i.e. stripe_unit == object_size, stripe_count == 1), round
> parent_overlap up to the next object boundary for the purposes of the
> overlap check.

I know Josh already reviewed this, but I wanted to make sure
I understood it.  Now I do.

Previously the first layered write request starting at the
"should be zeroed out" region would be treated as a normal
write.  Consequently the child object would get created, but
only the affected portion would get written, meaning the
"should be copyup'ed" region in the child object would be
zero filled.  Later reads to the "should be copyup'ed" would
see that child object existed, so the parent data will never
get filled into the "should be copyup'ed" area as it must be.

Rounding up like you did for this test makes sense--it makes
the granularity of this overlap test match the granularity
of the "exists" test.

Reviewed-by: Alex Elder <elder@linaro.org>

> 
> Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
> ---
>  drivers/block/rbd.c |   10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index 8295b3afa8e0..813e673d49df 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -1366,6 +1366,14 @@ static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
>  	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
>  }
>  
> +static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
> +{
> +	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
> +
> +	return obj_request->img_offset <
> +	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
> +}
> +
>  static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
>  {
>  	dout("%s: obj %p (was %d)\n", __func__, obj_request,
> @@ -2683,7 +2691,7 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
>  	 */
>  	if (!img_request_write_test(img_request) ||
>  		!img_request_layered_test(img_request) ||
> -		rbd_dev->parent_overlap <= obj_request->img_offset ||
> +		!obj_request_overlaps_parent(obj_request) ||
>  		((known = obj_request_known_test(obj_request)) &&
>  			obj_request_exists_test(obj_request))) {
>  
> 

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8295b3afa8e0..813e673d49df 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1366,6 +1366,14 @@  static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
 }
 
+static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
+{
+	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
+
+	return obj_request->img_offset <
+	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
+}
+
 static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
 {
 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
@@ -2683,7 +2691,7 @@  static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
 	 */
 	if (!img_request_write_test(img_request) ||
 		!img_request_layered_test(img_request) ||
-		rbd_dev->parent_overlap <= obj_request->img_offset ||
+		!obj_request_overlaps_parent(obj_request) ||
 		((known = obj_request_known_test(obj_request)) &&
 			obj_request_exists_test(obj_request))) {