From patchwork Wed Jul 29 09:25:45 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Mike Christie X-Patchwork-Id: 6891471 Return-Path: X-Original-To: patchwork-ceph-devel@patchwork.kernel.org Delivered-To: patchwork-parsemail@patchwork2.web.kernel.org Received: from mail.kernel.org (mail.kernel.org [198.145.29.136]) by patchwork2.web.kernel.org (Postfix) with ESMTP id 4E1B6C05AC for ; Wed, 29 Jul 2015 09:26:00 +0000 (UTC) Received: from mail.kernel.org (localhost [127.0.0.1]) by mail.kernel.org (Postfix) with ESMTP id 60F40207B9 for ; Wed, 29 Jul 2015 09:25:59 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id 58F4A2079C for ; Wed, 29 Jul 2015 09:25:58 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752300AbbG2JZx (ORCPT ); Wed, 29 Jul 2015 05:25:53 -0400 Received: from mx1.redhat.com ([209.132.183.28]:57136 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751243AbbG2JZu (ORCPT ); Wed, 29 Jul 2015 05:25:50 -0400 Received: from int-mx13.intmail.prod.int.phx2.redhat.com (int-mx13.intmail.prod.int.phx2.redhat.com [10.5.11.26]) by mx1.redhat.com (Postfix) with ESMTPS id 221289024B; Wed, 29 Jul 2015 09:25:50 +0000 (UTC) Received: from rh2.redhat.com (vpn-60-109.rdu2.redhat.com [10.10.60.109]) by int-mx13.intmail.prod.int.phx2.redhat.com (8.14.4/8.14.4) with ESMTP id t6T9PmLZ005564; Wed, 29 Jul 2015 05:25:49 -0400 From: mchristi@redhat.com To: ceph-devel@vger.kernel.org, target-devel@vger.kernel.org Subject: [PATCH 1/2] osd: add new extent comparison op Date: Wed, 29 Jul 2015 04:25:45 -0500 Message-Id: <1438161946-28473-2-git-send-email-mchristi@redhat.com> In-Reply-To: <1438161946-28473-1-git-send-email-mchristi@redhat.com> References: <1438161946-28473-1-git-send-email-mchristi@redhat.com> X-Scanned-By: MIMEDefang 2.68 on 10.5.11.26 Sender: 
ceph-devel-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: ceph-devel@vger.kernel.org X-Spam-Status: No, score=-8.3 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_HI, RP_MATCHES_RCVD, UNPARSEABLE_RELAY autolearn=unavailable version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on mail.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP From: Mike Christie This goes with the kernel patches "libceph: add support for CMPEXT compare extent requests" and "rbd: add support for COMPARE_AND_WRITE/CMPEXT". This adds support for the CMPEXT request. The request takes extent.length bytes sent by the client and compares them to the extent.length bytes at extent.offset on disk. If there is a miscompare, the osd will return -EILSEQ, the offset in the buffer where it occurred, and the buffer that was read from disk. This op is going to be used for SCSI COMPARE_AND_WRITE support. For this SCSI command, we are required to atomically do the CMPEXT operation and, if successful, do a WRITE operation. The kernel rbd client is sending those two ops in a multi op request. Note: I am still working on the locking for this operation. Is there a local lock I can take? 
Signed-off-by: Mike Christie --- src/include/rados.h | 3 +++ src/osd/ReplicatedPG.cc | 46 ++++++++++++++++++++++++++++++++++++++++++++++ src/osd/ReplicatedPG.h | 2 ++ 3 files changed, 51 insertions(+) diff --git a/src/include/rados.h b/src/include/rados.h index 424bef1..025dd3a 100644 --- a/src/include/rados.h +++ b/src/include/rados.h @@ -202,6 +202,8 @@ extern const char *ceph_osd_state_name(int s); /* sync */ \ f(SYNC_READ, __CEPH_OSD_OP(RD, DATA, 11), "sync_read") \ \ + f(CMPEXT, __CEPH_OSD_OP(RD, DATA, 31), "cmpext") \ + \ /* write */ \ f(WRITE, __CEPH_OSD_OP(WR, DATA, 1), "write") \ f(WRITEFULL, __CEPH_OSD_OP(WR, DATA, 2), "writefull") \ @@ -361,6 +363,7 @@ static inline int ceph_osd_op_uses_extent(int op) case CEPH_OSD_OP_ZERO: case CEPH_OSD_OP_APPEND: case CEPH_OSD_OP_TRIMTRUNC: + case CEPH_OSD_OP_CMPEXT: return true; default: return false; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index dcd11f5..2eedcca 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -2999,6 +2999,46 @@ int ReplicatedPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr) } } +int ReplicatedPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op) +{ + ceph_osd_op& op = osd_op.op; + vector read_ops(1); + OSDOp& read_op = read_ops[0]; + int result = 0; + uint64_t mismatch_offset = 0; + + read_op.op.op = CEPH_OSD_OP_SYNC_READ; + read_op.op.extent.offset = op.extent.offset; + read_op.op.extent.length = op.extent.length; + read_op.op.extent.truncate_seq = op.extent.truncate_seq; + read_op.op.extent.truncate_size = op.extent.truncate_size; + + result = do_osd_ops(ctx, read_ops); + if (result < 0) { + derr << "do_extent_cmp do_osd_ops failed " << result << dendl; + return result; + } + + if (read_op.outdata.length() != osd_op.indata.length()) + goto fail; + + for (uint64_t p = 0; p < osd_op.indata.length(); p++) { + if (read_op.outdata[p] != osd_op.indata[p]) { + mismatch_offset = p; + dout(20) << "mismatch at " << p << " read " << read_op.outdata << " sent 
" << osd_op.indata << dendl; + goto fail; + } + } + + return 0; + +fail: + ::encode(mismatch_offset, osd_op.outdata); + // should this be ::encode(read_op.outdata, osd_op.outdata); + osd_op.outdata.claim_append(read_op.outdata); + return -EILSEQ; +} + // ======================================================================== // low level osd ops @@ -3428,6 +3468,12 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) // --- READS --- + case CEPH_OSD_OP_CMPEXT: + tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(), soid.snap.val, size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq); + // TODO: Locking - this op and the write are supposed to be atomic + result = do_extent_cmp(ctx, osd_op); + break; + case CEPH_OSD_OP_SYNC_READ: if (pool.info.require_rollback()) { result = -EOPNOTSUPP; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 9c28036..f5d61c8 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -1382,6 +1382,8 @@ protected: int do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr); int do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr); + int do_extent_cmp(OpContext *ctx, OSDOp& osd_op); + bool pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata); int get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter);