
[01/18] libceph: add scatterlist messenger data type

Message ID 1438161835-27960-1-git-send-email-mchristi@redhat.com (mailing list archive)
State New, archived

Commit Message

Mike Christie July 29, 2015, 9:23 a.m. UTC
From: Mike Christie <michaelc@cs.wisc.edu>

LIO uses scatterlist for its page/data management. This patch
adds a scatterlist messenger data type, so LIO can pass its sg
down directly to rbd.

Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
---
 include/linux/ceph/messenger.h  | 13 ++++++
 include/linux/ceph/osd_client.h | 12 +++++-
 net/ceph/messenger.c            | 96 +++++++++++++++++++++++++++++++++++++++++
 net/ceph/osd_client.c           | 26 +++++++++++
 4 files changed, 146 insertions(+), 1 deletion(-)

Comments

Alex Elder July 29, 2015, 1:34 p.m. UTC | #1
On 07/29/2015 04:23 AM, mchristi@redhat.com wrote:
> From: Mike Christie <michaelc@cs.wisc.edu>
> 
> LIO uses scatterlist for its page/data management. This patch
> adds a scatterlist messenger data type, so LIO can pass its sg
> down directly to rbd.
> 
> Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>

I'm not going to be able to review all of these, and this
isn't even a complete review.  But it's something...

You're clearly on the right track, but I want to provide
a meaningful review for correctness and design so I'm
looking for a bit more information.

> ---
>  include/linux/ceph/messenger.h  | 13 ++++++
>  include/linux/ceph/osd_client.h | 12 +++++-
>  net/ceph/messenger.c            | 96 +++++++++++++++++++++++++++++++++++++++++
>  net/ceph/osd_client.c           | 26 +++++++++++
>  4 files changed, 146 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> index 3775327..bc1bde8 100644
> --- a/include/linux/ceph/messenger.h
> +++ b/include/linux/ceph/messenger.h
> @@ -79,6 +79,7 @@ enum ceph_msg_data_type {
>  #ifdef CONFIG_BLOCK
>  	CEPH_MSG_DATA_BIO,	/* data source/destination is a bio list */
>  #endif /* CONFIG_BLOCK */
> +	CEPH_MSG_DATA_SG,	/* data source/destination is a scatterlist */
>  };
>  
>  static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
> @@ -90,6 +91,7 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
>  #ifdef CONFIG_BLOCK
>  	case CEPH_MSG_DATA_BIO:
>  #endif /* CONFIG_BLOCK */
> +	case CEPH_MSG_DATA_SG:
>  		return true;
>  	default:
>  		return false;
> @@ -112,6 +114,11 @@ struct ceph_msg_data {
>  			unsigned int	alignment;	/* first page */
>  		};
>  		struct ceph_pagelist	*pagelist;
> +		struct {
> +			struct scatterlist *sgl;
> +			unsigned int	sgl_init_offset;
> +			u64		sgl_length;
> +		};

Can you supply a short explanation of what these fields
represent?  It seems sgl_init_offset is the offset of the
starting byte in the sgl, but is the purpose of that for
page offset calculation, or does it represent an offset
into the total length of the sgl?  Or put another way,
does sgl_init_offset represent some portion of the
sgl_length that has been consumed already (and so the
initial residual length is sgl_length - sgl_init_offset)?

>  	};
>  };
>  
> @@ -139,6 +146,10 @@ struct ceph_msg_data_cursor {
>  			struct page	*page;		/* page from list */
>  			size_t		offset;		/* bytes from list */
>  		};
> +		struct {
> +			struct scatterlist	*sg;		/* curr sg */

							/* current sg */

> +			unsigned int		sg_consumed;

Here too, what does sg_consumed represent with respect to the
initial offset and the length?

I guess I'm going to stop with that.  It'll be a lot easier
for me to review this if I'm sure I understand what
these represent.  Thanks.

					-Alex

> +		};
>  	};
>  };
>  

. . .

Mike Christie July 29, 2015, 5:49 p.m. UTC | #2
On 07/29/2015 08:34 AM, Alex Elder wrote:
> On 07/29/2015 04:23 AM, mchristi@redhat.com wrote:
>> From: Mike Christie <michaelc@cs.wisc.edu>
>>
>> LIO uses scatterlist for its page/data management. This patch
>> adds a scatterlist messenger data type, so LIO can pass its sg
>> down directly to rbd.
>>
>> Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
> 
> I'm not going to be able to review all of these, and this
> isn't even a complete review.  But it's something...

No problem. Thanks for any comments.

> 
> You're clearly on the right track, but I want to provide
> a meaningful review for correctness and design so I'm
> looking for a bit more information.
> 
>> ---
>>  include/linux/ceph/messenger.h  | 13 ++++++
>>  include/linux/ceph/osd_client.h | 12 +++++-
>>  net/ceph/messenger.c            | 96 +++++++++++++++++++++++++++++++++++++++++
>>  net/ceph/osd_client.c           | 26 +++++++++++
>>  4 files changed, 146 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
>> index 3775327..bc1bde8 100644
>> --- a/include/linux/ceph/messenger.h
>> +++ b/include/linux/ceph/messenger.h
>> @@ -79,6 +79,7 @@ enum ceph_msg_data_type {
>>  #ifdef CONFIG_BLOCK
>>  	CEPH_MSG_DATA_BIO,	/* data source/destination is a bio list */
>>  #endif /* CONFIG_BLOCK */
>> +	CEPH_MSG_DATA_SG,	/* data source/destination is a scatterlist */
>>  };
>>  
>>  static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
>> @@ -90,6 +91,7 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
>>  #ifdef CONFIG_BLOCK
>>  	case CEPH_MSG_DATA_BIO:
>>  #endif /* CONFIG_BLOCK */
>> +	case CEPH_MSG_DATA_SG:
>>  		return true;
>>  	default:
>>  		return false;
>> @@ -112,6 +114,11 @@ struct ceph_msg_data {
>>  			unsigned int	alignment;	/* first page */
>>  		};
>>  		struct ceph_pagelist	*pagelist;
>> +		struct {
>> +			struct scatterlist *sgl;
>> +			unsigned int	sgl_init_offset;
>> +			u64		sgl_length;
>> +		};
> 
> Can you supply a short explanation of what these fields
> represent?  It seems sgl_init_offset is the offset of the
> starting byte in the sgl, but is the purpose of that for
> page offset calculation, or does it represent an offset
> into the total length of the sgl?  Or put another way,
> does sgl_init_offset represent some portion of the
> sgl_length that has been consumed already (and so the
> initial residual length is sgl_length - sgl_init_offset)?


sgl - starting scatterlist entry we are going to send/receive to/from.

sgl_init_offset - byte offset in the sgl above that we will start
executing from. It is for cases where a LIO command crossed
segment/object boundaries, so we had to break it up, and the first obj
request ended up in the middle of a scatterlist entry. For the second
obj request we set the sgl to the sg we ended on in the first request,
and then set the sgl_init_offset to where we left off in the first
request.

So it basically allows me to avoid cloning the list the way the bio
code does. However, if we did clone it, then I could just manipulate
the cloned sg's sg->offset instead of adding the sgl_init_offset field.

sgl_length - number of bytes in the sgl we are going to send/receive.
This also is for the case where we broke up the LIO command into
multiple obj requests.
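
As a rough illustration (not part of the patch), a caller that split one
LIO command into two object requests could describe the second half like
this, using the osd_req_op_extent_osd_data_sg() helper added later in
this patch; the function and variable names here are made up for the
example:

	/*
	 * Hypothetical sketch: "sg" is the scatterlist entry the first
	 * object request ended in, "done" is how many bytes of that
	 * entry it already covered, and "remaining" is what is left
	 * for the second request.
	 */
	static void map_second_half(struct ceph_osd_request *req2,
				    struct scatterlist *sg,
				    unsigned int done, u64 remaining)
	{
		/* sgl = sg, sgl_init_offset = done, sgl_length = remaining */
		osd_req_op_extent_osd_data_sg(req2, 0, sg, done, remaining);
	}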


> 
>>  	};
>>  };
>>  
>> @@ -139,6 +146,10 @@ struct ceph_msg_data_cursor {
>>  			struct page	*page;		/* page from list */
>>  			size_t		offset;		/* bytes from list */
>>  		};
>> +		struct {
>> +			struct scatterlist	*sg;		/* curr sg */
> 
> 							/* current sg */
> 
>> +			unsigned int		sg_consumed;
> 
> Here too, what does sg_consumed represent with respect to the
> initial offset and the length?

It is the number of bytes in the sgl we have sent/received. It is used
by the messenger advance code to track whether we need to advance to the
next sg or whether we still have data left from the current one.
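
A quick worked example may help (numbers made up, following the advance
logic in the patch below):

	/*
	 * Say sg->length = 4096 and sgl_init_offset = 1024, so the
	 * cursor starts with sg_consumed = 1024.  After sending 2048
	 * bytes, sg_consumed = 3072 < sg->length, so we stay on this
	 * entry.  After another 1024 bytes, sg_consumed == sg->length,
	 * so (assuming resid is not yet 0) ceph_msg_data_sg_advance()
	 * moves to sg_next(sg) and resets sg_consumed to 0.
	 */
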
Christoph Hellwig July 29, 2015, 5:55 p.m. UTC | #3
On Wed, Jul 29, 2015 at 04:23:38AM -0500, mchristi@redhat.com wrote:
> From: Mike Christie <michaelc@cs.wisc.edu>
> 
> LIO uses scatterlist for its page/data management. This patch
> adds a scatterlist messenger data type, so LIO can pass its sg
> down directly to rbd.

Just as I mentioned for David's patches, this is the wrong way to attack
your problem.  The block layer already supports WRITE SAME, and COMPARE
AND WRITE needs to be supported at that level too instead of creating
artificial bypasses.
Mike Christie July 29, 2015, 10:59 p.m. UTC | #4
On 07/29/2015 12:55 PM, Christoph Hellwig wrote:
> On Wed, Jul 29, 2015 at 04:23:38AM -0500, mchristi@redhat.com wrote:
>> From: Mike Christie <michaelc@cs.wisc.edu>
>>
>> LIO uses scatterlist for its page/data management. This patch
>> adds a scatterlist messenger data type, so LIO can pass its sg
>> down directly to rbd.
> 
> Just as I mentioned for David's patches, this is the wrong way to attack
> your problem.  The block layer already supports WRITE SAME, and COMPARE
> AND WRITE needs to be supported at that level too instead of creating
> artificial bypasses.

Why do I have to use the block layer? I just want to map the se_cmd to a
ceph request and then put it back on the wire. I don't think I need any
of the block layer services. We will do things like io scheduling on the
OSD side. We just want to use LIO as more of a passthrough.
Mike Christie July 29, 2015, 11:40 p.m. UTC | #5
On 07/29/2015 05:59 PM, Mike Christie wrote:
> On 07/29/2015 12:55 PM, Christoph Hellwig wrote:
>> On Wed, Jul 29, 2015 at 04:23:38AM -0500, mchristi@redhat.com wrote:
>>> From: Mike Christie <michaelc@cs.wisc.edu>
>>>
>>> LIO uses scatterlist for its page/data management. This patch
>>> adds a scatterlist messenger data type, so LIO can pass its sg
>>> down directly to rbd.
>>
>> Just as I mentioned for David's patches, this is the wrong way to attack
>> your problem.  The block layer already supports WRITE SAME, and COMPARE
>> AND WRITE needs to be supported at that level too instead of creating
>> artificial bypasses.
> 
> Why do I have to use the block layer? I just want to map the se_cmd to a
> ceph request and then put it back on the wire. I don't think I need any
> of the block layer services. We will do things like io scheduling on the
> OSD side. We just want to use LIO as more of a passthrough.

Maybe I misunderstood you.

I guess I was viewing this similar to cephfs where it does not use rbd
and the block layer. It just makes ceph/rados calls directly using
libceph. I am using rbd.c for its helper/wrapper functions around the
libceph ones, but I could just make libceph calls directly too.

Were you saying that because for LIO support we need to do more
block-layer-ish operations like write same, compare and write, etc. than
cephfs, I should not do the LIO backend and we should always go
through rbd for LIO support?

Is that for all operations? For distributed TMFs and PRs, are you
thinking I should make those more block layer based (some sort of queue
or block device callouts or REQ_ types), or should those still have some
sort of LIO callouts which could call different locking/cluster APIs
like libceph?

Nicholas A. Bellinger July 30, 2015, 7:34 a.m. UTC | #6
Hey Mike & HCH,

On Wed, 2015-07-29 at 18:40 -0500, Mike Christie wrote:
> On 07/29/2015 05:59 PM, Mike Christie wrote:
> > On 07/29/2015 12:55 PM, Christoph Hellwig wrote:
> >> On Wed, Jul 29, 2015 at 04:23:38AM -0500, mchristi@redhat.com wrote:
> >>> From: Mike Christie <michaelc@cs.wisc.edu>
> >>>
> >>> LIO uses scatterlist for its page/data management. This patch
> >>> adds a scatterlist messenger data type, so LIO can pass its sg
> >>> down directly to rbd.
> >>
> >> Just as I mentioned for David's patches, this is the wrong way to attack
> >> your problem.  The block layer already supports WRITE SAME, and COMPARE
> >> AND WRITE needs to be supported at that level too instead of creating
> >> artificial bypasses.
> > 
> > Why do I have to use the block layer? I just want to map the se_cmd to a
> > ceph request and then put it back on the wire. I don't think I need any
> > of the block layer services. We will do things like io scheduling on the
> > OSD side. We just want to use LIO as more of a passthrough.
> 
> Maybe I misunderstood you.
> 
> I guess I was viewing this similar to cephfs where it does not use rbd
> and the block layer. It just makes ceph/rados calls directly using
> libceph. I am using rbd.c for its helper/wrapper functions around the
> libceph ones, but I could just make libceph calls directly too.
> 
> Were you saying that because for LIO support we need to do more
> block-layer-ish operations like write same, compare and write, etc. than
> cephfs, I should not do the LIO backend and we should always go
> through rbd for LIO support?

If we're using common request_queue function pointers, it would avoid
the need to maintain an extra backend driver, and would give other
make_request_fn() based block drivers using target-core a generic
offload interface to utilize.

In the WRITE_SAME + COMPARE_AND_WRITE pass-through cases, IBLOCK se_cmd
pass-through should be invoking the driver provided VAAI callbacks
directly if they exist, and disabling local target-core CDB emulation.

For EXTENDED_COPY pass-through, target-core copy-manager still needs to
be responsible for config_group dependencies across multiple se_device
backends, with an IBLOCK pass-through providing both block_device *src_bd
+ *dst_bd pointers into a request_queue callback for different PUSH/PULL
offload models.

It should also be able to fall back to local copy if source +
destination devices do not both support the same type of copy-offload
pass-through.
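
To make that concrete, here is a purely hypothetical sketch of what such
request_queue callbacks could look like; none of these hooks exist today,
and the names and signatures are invented for illustration:

	/*
	 * Hypothetical only: a copy-offload callback set that a
	 * make_request_fn() based driver could register, so target-core
	 * invokes the offload instead of emulating it locally.
	 */
	struct blk_offload_ops {
		int (*write_same)(struct block_device *bdev, sector_t lba,
				  sector_t nr_sects, struct page *page);
		int (*compare_and_write)(struct block_device *bdev,
					 sector_t lba, sector_t nr_sects,
					 struct scatterlist *sgl);
		int (*copy_range)(struct block_device *src_bd, sector_t src_lba,
				  struct block_device *dst_bd, sector_t dst_lba,
				  sector_t nr_sects);
	};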

> 
> Is that for all operations? For distributed TMFs and PRs, are you
> thinking I should make those more block layer based (some sort of queue
> or block device callouts or REQ_ types), or should those still have some
> sort of LIO callouts which could call different locking/cluster APIs
> like libceph?

The PR-OUT logic for REGISTER w/ remote I_PORT and remote PREEMPT-*
currently obtains the necessary config_group dependencies, and would
need to be considered for request_queue based PR pass-through too.

Exposing target TMF pass-through into request_queue is a different
beast...

Christoph Hellwig July 30, 2015, 2:55 p.m. UTC | #7
On Wed, Jul 29, 2015 at 06:40:01PM -0500, Mike Christie wrote:
> I guess I was viewing this similar to cephfs where it does not use rbd
> and the block layer. It just makes ceph/rados calls directly using
> libceph. I am using rbd.c for its helper/wrapper functions around the
> libceph ones, but I could just make libceph calls directly too.
> 
> Were you saying that because for LIO support we need to do more
> block-layer-ish operations like write same, compare and write, etc. than
> cephfs, I should not do the LIO backend and we should always go
> through rbd for LIO support?

I'd really prefer that.  We have other users for these facilities as
well, and I'd much prefer having block layer support rather than working
around it.

> Is that for all operations? For distributed TMFs and PRs, are you
> thinking I should make those more block layer based (some sort of queue
> or block device callouts or REQ_ types), or should those still have some
> sort of LIO callouts which could call different locking/cluster APIs
> like libceph?

Yes.  FYI, I've pushed out my WIP work for PRs here:

http://git.infradead.org/users/hch/scsi.git/shortlog/refs/heads/pr-api

TMFs are a bit of a borderline case, but instead of needing special
bypasses I'd rather find a way to add them.  For example, we already
have TMF ioctls for SCSI, so we might as well pull this up to the block
layer.

David Disseldorp Feb. 4, 2016, 10:33 a.m. UTC | #8
On Wed, 29 Jul 2015 04:23:38 -0500, mchristi@redhat.com wrote:

> From: Mike Christie <michaelc@cs.wisc.edu>
> 
> LIO uses scatterlist for its page/data management. This patch
> adds a scatterlist messenger data type, so LIO can pass its sg
> down directly to rbd.
> 
> Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>

...

>  /*
> + * For an sg data item, a piece is whatever remains of the
> + * current sg entry, or the first entry in the next sg in the
> + * list.
> + */
> +static void ceph_msg_data_sg_cursor_init(struct ceph_msg_data_cursor *cursor,
> +					 size_t length)
> +{
> +	struct ceph_msg_data *data = cursor->data;
> +	struct scatterlist *sg;
> +
> +	BUG_ON(data->type != CEPH_MSG_DATA_SG);
> +
> +	sg = data->sgl;
> +	BUG_ON(!sg);
> +
> +	cursor->resid = min_t(u64, length, data->sgl_length);
> +	cursor->sg = sg;
> +	cursor->sg_consumed = data->sgl_init_offset;
> +	cursor->last_piece = cursor->resid <= sg->length;
> +}

Just in case the CEPH_MSG_DATA_SG changes are picked up, the
cursor->last_piece calculation here needs to take into account the
data->sgl_init_offset:
	if (cursor->resid <= (sg->length - data->sgl_init_offset))
		cursor->last_piece = true;
	...

Cheers, David
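
For reference, here is a minimal sketch of the cursor init with David's
suggested fix folded in; everything else is kept exactly as posted:

	static void ceph_msg_data_sg_cursor_init(struct ceph_msg_data_cursor *cursor,
						 size_t length)
	{
		struct ceph_msg_data *data = cursor->data;
		struct scatterlist *sg;

		BUG_ON(data->type != CEPH_MSG_DATA_SG);

		sg = data->sgl;
		BUG_ON(!sg);

		cursor->resid = min_t(u64, length, data->sgl_length);
		cursor->sg = sg;
		cursor->sg_consumed = data->sgl_init_offset;
		/* account for the part of the first entry that a previous
		 * request already consumed */
		cursor->last_piece =
			cursor->resid <= sg->length - data->sgl_init_offset;
	}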

Patch

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 3775327..bc1bde8 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -79,6 +79,7 @@  enum ceph_msg_data_type {
 #ifdef CONFIG_BLOCK
 	CEPH_MSG_DATA_BIO,	/* data source/destination is a bio list */
 #endif /* CONFIG_BLOCK */
+	CEPH_MSG_DATA_SG,	/* data source/destination is a scatterlist */
 };
 
 static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
@@ -90,6 +91,7 @@  static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
 #ifdef CONFIG_BLOCK
 	case CEPH_MSG_DATA_BIO:
 #endif /* CONFIG_BLOCK */
+	case CEPH_MSG_DATA_SG:
 		return true;
 	default:
 		return false;
@@ -112,6 +114,11 @@  struct ceph_msg_data {
 			unsigned int	alignment;	/* first page */
 		};
 		struct ceph_pagelist	*pagelist;
+		struct {
+			struct scatterlist *sgl;
+			unsigned int	sgl_init_offset;
+			u64		sgl_length;
+		};
 	};
 };
 
@@ -139,6 +146,10 @@  struct ceph_msg_data_cursor {
 			struct page	*page;		/* page from list */
 			size_t		offset;		/* bytes from list */
 		};
+		struct {
+			struct scatterlist	*sg;		/* curr sg */
+			unsigned int		sg_consumed;
+		};
 	};
 };
 
@@ -294,6 +305,8 @@  extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
 extern void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
 				size_t length);
 #endif /* CONFIG_BLOCK */
+extern void ceph_msg_data_add_sg(struct ceph_msg *msg, struct scatterlist *sgl,
+				 unsigned int sgl_init_offset, u64 length);
 
 extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
 				     bool can_fail);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 0890167..2152f06 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -52,6 +52,7 @@  enum ceph_osd_data_type {
 #ifdef CONFIG_BLOCK
 	CEPH_OSD_DATA_TYPE_BIO,
 #endif /* CONFIG_BLOCK */
+	CEPH_OSD_DATA_TYPE_SG,
 };
 
 struct ceph_osd_data {
@@ -70,6 +71,11 @@  struct ceph_osd_data {
 			struct bio	*bio;		/* list of bios */
 			size_t		bio_length;	/* total in list */
 		};
+		struct {
+			struct scatterlist *sgl;
+			size_t		sgl_length;
+			unsigned int	sgl_init_offset;
+		};
 #endif /* CONFIG_BLOCK */
 	};
 };
@@ -313,7 +319,11 @@  extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *,
 					unsigned int which,
 					struct bio *bio, size_t bio_length);
 #endif /* CONFIG_BLOCK */
-
+extern void osd_req_op_extent_osd_data_sg(struct ceph_osd_request *,
+					unsigned int which,
+					struct scatterlist *sgl,
+					unsigned int init_sg_offset,
+					u64 length);
 extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
 					unsigned int which,
 					struct ceph_pagelist *pagelist);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index e3be1d2..08d39fb 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -893,6 +893,75 @@  static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
 #endif /* CONFIG_BLOCK */
 
 /*
+ * For an sg data item, a piece is whatever remains of the
+ * current sg entry, or the first entry in the next sg in the
+ * list.
+ */
+static void ceph_msg_data_sg_cursor_init(struct ceph_msg_data_cursor *cursor,
+					 size_t length)
+{
+	struct ceph_msg_data *data = cursor->data;
+	struct scatterlist *sg;
+
+	BUG_ON(data->type != CEPH_MSG_DATA_SG);
+
+	sg = data->sgl;
+	BUG_ON(!sg);
+
+	cursor->resid = min_t(u64, length, data->sgl_length);
+	cursor->sg = sg;
+	cursor->sg_consumed = data->sgl_init_offset;
+	cursor->last_piece = cursor->resid <= sg->length;
+}
+
+static struct page *ceph_msg_data_sg_next(struct ceph_msg_data_cursor *cursor,
+					  size_t *page_offset, size_t *length)
+{
+	struct ceph_msg_data *data = cursor->data;
+	struct scatterlist *sg;
+
+	BUG_ON(data->type != CEPH_MSG_DATA_SG);
+
+	sg = cursor->sg;
+	BUG_ON(!sg);
+
+	*page_offset = sg->offset + cursor->sg_consumed;
+
+	if (cursor->last_piece)
+		*length = cursor->resid;
+	else
+		*length = sg->length - cursor->sg_consumed;
+
+	/* currently support non clustered sg pages */
+	return sg_page(sg);
+}
+
+static bool ceph_msg_data_sg_advance(struct ceph_msg_data_cursor *cursor,
+				     size_t bytes)
+{
+	BUG_ON(cursor->data->type != CEPH_MSG_DATA_SG);
+
+	/* Advance the cursor offset */
+	BUG_ON(cursor->resid < bytes);
+	cursor->resid -= bytes;
+	cursor->sg_consumed += bytes;
+
+	if (!bytes || cursor->sg_consumed < cursor->sg->length)
+		return false;	/* more bytes to process in the current page */
+
+	if (!cursor->resid)
+		return false;	/* no more data */
+
+	/* For WRITE_SAME we have a single sg that is written over and over */
+	if (sg_next(cursor->sg))
+		cursor->sg = sg_next(cursor->sg);
+	cursor->sg_consumed = 0;
+
+	cursor->last_piece = cursor->resid <= cursor->sg->length;
+	return true;
+}
+
+/*
  * For a page array, a piece comes from the first page in the array
  * that has not already been fully consumed.
  */
@@ -1075,6 +1144,9 @@  static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
 		ceph_msg_data_bio_cursor_init(cursor, length);
 		break;
 #endif /* CONFIG_BLOCK */
+	case CEPH_MSG_DATA_SG:
+		ceph_msg_data_sg_cursor_init(cursor, length);
+		break;
 	case CEPH_MSG_DATA_NONE:
 	default:
 		/* BUG(); */
@@ -1123,6 +1195,9 @@  static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
 		page = ceph_msg_data_bio_next(cursor, page_offset, length);
 		break;
 #endif /* CONFIG_BLOCK */
+	case CEPH_MSG_DATA_SG:
+		page = ceph_msg_data_sg_next(cursor, page_offset, length);
+		break;
 	case CEPH_MSG_DATA_NONE:
 	default:
 		page = NULL;
@@ -1159,6 +1234,9 @@  static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
 		new_piece = ceph_msg_data_bio_advance(cursor, bytes);
 		break;
 #endif /* CONFIG_BLOCK */
+	case CEPH_MSG_DATA_SG:
+		new_piece = ceph_msg_data_sg_advance(cursor, bytes);
+		break;
 	case CEPH_MSG_DATA_NONE:
 	default:
 		BUG();
@@ -3182,6 +3260,24 @@  void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
 EXPORT_SYMBOL(ceph_msg_data_add_bio);
 #endif	/* CONFIG_BLOCK */
 
+void ceph_msg_data_add_sg(struct ceph_msg *msg, struct scatterlist *sgl,
+			  unsigned int sgl_init_offset, u64 length)
+{
+	struct ceph_msg_data *data;
+
+	BUG_ON(!sgl);
+
+	data = ceph_msg_data_create(CEPH_MSG_DATA_SG);
+	BUG_ON(!data);
+	data->sgl = sgl;
+	data->sgl_length = length;
+	data->sgl_init_offset = sgl_init_offset;
+
+	list_add_tail(&data->links, &msg->data);
+	msg->data_length += length;
+}
+EXPORT_SYMBOL(ceph_msg_data_add_sg);
+
 /*
  * construct a new message with given type, size
  * the new msg has a ref count of 1.
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index f8178b7..fd0a52e 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -128,6 +128,16 @@  static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
 }
 #endif /* CONFIG_BLOCK */
 
+static void ceph_osd_data_sg_init(struct ceph_osd_data *osd_data,
+				  struct scatterlist *sgl,
+				  unsigned int init_sg_offset, u64 length)
+{
+	osd_data->type = CEPH_OSD_DATA_TYPE_SG;
+	osd_data->sgl = sgl;
+	osd_data->sgl_length = length;
+	osd_data->sgl_init_offset = init_sg_offset;
+}
+
 #define osd_req_op_data(oreq, whch, typ, fld)	\
 	({						\
 		BUG_ON(whch >= (oreq)->r_num_ops);	\
@@ -206,6 +216,17 @@  void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
 EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
 #endif /* CONFIG_BLOCK */
 
+void osd_req_op_extent_osd_data_sg(struct ceph_osd_request *osd_req,
+			unsigned int which, struct scatterlist *sgl,
+			unsigned int init_sg_offset, u64 length)
+{
+	struct ceph_osd_data *osd_data;
+
+	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+	ceph_osd_data_sg_init(osd_data, sgl, init_sg_offset, length);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data_sg);
+
 static void osd_req_op_cls_request_info_pagelist(
 			struct ceph_osd_request *osd_req,
 			unsigned int which, struct ceph_pagelist *pagelist)
@@ -317,6 +338,8 @@  static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
 	case CEPH_OSD_DATA_TYPE_BIO:
 		return (u64)osd_data->bio_length;
 #endif /* CONFIG_BLOCK */
+	case CEPH_OSD_DATA_TYPE_SG:
+		return osd_data->sgl_length;
 	default:
 		WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
 		return 0;
@@ -727,6 +750,9 @@  static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
 	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
 		ceph_msg_data_add_bio(msg, osd_data->bio, length);
 #endif
+	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_SG) {
+		ceph_msg_data_add_sg(msg, osd_data->sgl,
+				     osd_data->sgl_init_offset, length);
 	} else {
 		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
 	}