[2/2] ceph osd: add support for new op cmpext
diff mbox

Message ID 1457591672-17430-3-git-send-email-mchristi@redhat.com
State New
Headers show

Commit Message

Michael Christie March 10, 2016, 6:34 a.m. UTC
This adds support for a new op cmpext. The request carries
extent.length bytes of data; the osd reads extent.length bytes at
extent.offset from the object and compares the two. If there is a
miscompare the osd will return -EILSEQ, along with the buffer that
was read.

rbd will use this in a multi op request to implement the
SCSI COMPARE_AND_WRITE request which is used by VMware for
its atomic test and set request.

v2:
- Merge David's tracing fixes.
- On a miscompare, return only the buffer that was read instead of
the mismatch offset plus the buffer. The client can work out the
offset itself if it needs it.

Signed-off-by: Mike Christie <mchristi@redhat.com>
---
 src/include/rados.h     |  2 ++
 src/osd/ReplicatedPG.cc | 31 +++++++++++++++++++++++++++++++
 src/osd/ReplicatedPG.h  |  1 +
 src/tracing/osd.tp      | 22 ++++++++++++++++++++++
 4 files changed, 56 insertions(+)

Comments

David Disseldorp March 10, 2016, 12:03 p.m. UTC | #1
Hi Mike,

On Thu, 10 Mar 2016 00:34:32 -0600, Mike Christie wrote:

> This adds support for a new op cmpext. The request will read
> extent.length bytes and compare them to extent.length bytes at
> extent.offset on disk. If there is a miscompare the osd will return
> -EILSEQ, and the mismatched buffer that was read.
> 
> rbd will use this in a multi op request to implement the
> SCSI COMPARE_AND_WRITE request which is used by VMware for
> its atomic test and set request.
> 
> v2:
> - Merge David's tracing fixes.
> - Instead of returning the mismatch offset and buffer on matching
> failure just return the buffer. The client can figure out the offset
> if it needs it.

What's your reason for dropping the mismatch offset? The osd has
to perform the comparison, so might as well do something with the
result, even if it's ignored by the client in some cases.

Cheers, David
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Christie March 10, 2016, 5:06 p.m. UTC | #2
On 03/10/2016 06:03 AM, David Disseldorp wrote:
> Hi Mike,
> 
> On Thu, 10 Mar 2016 00:34:32 -0600, Mike Christie wrote:
> 
>> This adds support for a new op cmpext. The request will read
>> extent.length bytes and compare them to extent.length bytes at
>> extent.offset on disk. If there is a miscompare the osd will return
>> -EILSEQ, and the mismatched buffer that was read.
>>
>> rbd will use this in a multi op request to implement the
>> SCSI COMPARE_AND_WRITE request which is used by VMware for
>> its atomic test and set request.
>>
>> v2:
>> - Merge David's tracing fixes.
>> - Instead of returning the mismatch offset and buffer on matching
>> failure just return the buffer. The client can figure out the offset
>> if it needs it.
> 
> What's your reason for dropping the mismatch offset?

I was not sure if anyone else was going to use it. I can add it back. It
does not matter to me.
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Disseldorp March 10, 2016, 5:12 p.m. UTC | #3
On Thu, 10 Mar 2016 11:06:22 -0600, Mike Christie wrote:

> > What's your reason for dropping the mismatch offset?  
> 
> I was not sure if anyone else was going to use it. I can add it back. It
> does not matter to me.

Thanks - I'd prefer to keep it, given that it doesn't cost anything.

Cheers, David
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/src/include/rados.h b/src/include/rados.h
index 4d508c0..229d855 100644
--- a/src/include/rados.h
+++ b/src/include/rados.h
@@ -258,6 +258,7 @@  extern const char *ceph_osd_state_name(int s);
 									    \
 	/* ESX/SCSI */							    \
 	f(WRITESAME,	__CEPH_OSD_OP(WR, DATA, 38),	"write-same")	    \
+	f(CMPEXT,	__CEPH_OSD_OP(RD, DATA, 31),	"cmpext")	    \
 									    \
 	/** multi **/							    \
 	f(CLONERANGE,	__CEPH_OSD_OP(WR, MULTI, 1),	"clonerange")	    \
@@ -358,6 +359,7 @@  static inline int ceph_osd_op_uses_extent(int op)
 	case CEPH_OSD_OP_ZERO:
 	case CEPH_OSD_OP_APPEND:
 	case CEPH_OSD_OP_TRIMTRUNC:
+	case CEPH_OSD_OP_CMPEXT:
 		return true;
 	default:
 		return false;
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 6a6112e..4593929 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -3650,6 +3650,32 @@  int ReplicatedPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
   }
 }
 
+int ReplicatedPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)  // CMPEXT: compare osd_op.indata with the extent read back; 0 on match, -EILSEQ on mismatch, <0 read error
+{
+  ceph_osd_op& op = osd_op.op;
+  vector<OSDOp> read_ops(1);  // one-element op vector used for the nested read
+  OSDOp& read_op = read_ops[0];
+  int result = 0;
+
+  read_op.op.op = CEPH_OSD_OP_SYNC_READ;  // nested read reuses the caller's extent and truncate fields
+  read_op.op.extent.offset = op.extent.offset;
+  read_op.op.extent.length = op.extent.length;
+  read_op.op.extent.truncate_seq = op.extent.truncate_seq;
+  read_op.op.extent.truncate_size = op.extent.truncate_size;
+
+  result = do_osd_ops(ctx, read_ops);  // re-enters do_osd_ops with the same ctx to perform the read
+  if (result < 0) {
+    derr << "do_extent_cmp do_osd_ops failed " << result << dendl;
+    return result;  // read error is passed through unchanged
+  }
+
+  if (osd_op.indata.contents_equal(read_op.outdata))
+    return 0;  // request payload matches what was read
+
+  osd_op.outdata.claim_append(read_op.outdata);  // mismatch: hand the data that was read back in outdata
+  return -EILSEQ;
+}
+
 int ReplicatedPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
 {
   ceph_osd_op& op = osd_op.op;
@@ -4154,6 +4180,11 @@  int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
       
       // --- READS ---
 
+    case CEPH_OSD_OP_CMPEXT:
+      tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
+      result = do_extent_cmp(ctx, osd_op);
+      break;
+
     case CEPH_OSD_OP_SYNC_READ:
       if (pool.info.require_rollback()) {
 	result = -EOPNOTSUPP;
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 8004d25..adaf8af 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -1430,6 +1430,7 @@  protected:
   int do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr);
   int do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr);
 
+  int do_extent_cmp(OpContext *ctx, OSDOp& osd_op);
   int do_writesame(OpContext *ctx, OSDOp& osd_op);
 
   bool pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata);
diff --git a/src/tracing/osd.tp b/src/tracing/osd.tp
index 36ffa7e..e132b61 100644
--- a/src/tracing/osd.tp
+++ b/src/tracing/osd.tp
@@ -91,6 +91,28 @@  TRACEPOINT_EVENT(osd, do_osd_op_pre,
     )
 )
 
+TRACEPOINT_EVENT(osd, do_osd_op_pre_extent_cmp,  /* emitted by the CEPH_OSD_OP_CMPEXT case before do_extent_cmp() is called */
+    TP_ARGS(
+        const char*, oid,  /* soid.oid.name at the call site */
+        uint64_t, snap,  /* soid.snap.val at the call site */
+        uint64_t, osize,  /* oi.size at the call site */
+        uint32_t, oseq,  /* oi.truncate_seq at the call site */
+        uint64_t, offset,  /* op.extent.offset */
+        uint64_t, length,  /* op.extent.length */
+        uint64_t, truncate_size,  /* op.extent.truncate_size */
+        uint32_t, truncate_seq),  /* op.extent.truncate_seq */
+    TP_FIELDS(
+        ctf_string(oid, oid)
+        ctf_integer(uint64_t, snap, snap)
+        ctf_integer(uint64_t, osize, osize)
+        ctf_integer(uint32_t, oseq, oseq)
+        ctf_integer(uint64_t, offset, offset)
+        ctf_integer(uint64_t, length, length)
+        ctf_integer(uint64_t, truncate_size, truncate_size)
+        ctf_integer(uint32_t, truncate_seq, truncate_seq)
+    )
+)
+
 TRACEPOINT_EVENT(osd, do_osd_op_pre_read,
     TP_ARGS(
         const char*, oid,