From patchwork Wed Jan 31 14:12:19 2018
X-Patchwork-Submitter: Jeff Layton
X-Patchwork-Id: 10194065
From: jlayton@poochiereds.net
To: nfs-ganesha-devel@lists.sourceforge.net, ceph-devel@vger.kernel.org
Subject: [nfs-ganesha RFC PATCH 6/6] SAL: add new clustered RADOS recovery backend
Date: Wed, 31 Jan 2018 09:12:19 -0500
Message-Id: <20180131141219.16929-7-jlayton@poochiereds.net>
X-Mailer: git-send-email 2.14.3
In-Reply-To: <20180131141219.16929-1-jlayton@poochiereds.net>
References: <20180131141219.16929-1-jlayton@poochiereds.net>
X-Mailing-List: ceph-devel@vger.kernel.org

From: Jeff Layton

Add a new clustered RADOS recovery backend driver. This uses a common
RADOS object to coordinate a cluster-wide grace period. Each node in the
cluster should be assigned a unique nodeid (uint64_t), which we then use
to check or request a grace period and to determine from which database
to allow recovery.
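For readers skimming the series, here is a rough, illustrative-only sketch
(not part of the patch) of the per-node flow described above. The
rados_grace_*() prototypes are inferred from how the driver below calls into
the rados_grace helper library that the CMake change links in; the io_ctx,
object name, and nodeid values are placeholders.

/*
 * Illustrative sketch only -- not part of the patch. Prototypes are
 * inferred from the rados_grace calls made by the driver below.
 */
#include <stdbool.h>
#include <stdint.h>
#include <rados/librados.h>

int rados_grace_create(rados_ioctx_t io_ctx, const char *oid);
int rados_grace_join(rados_ioctx_t io_ctx, const char *oid, uint64_t nodeid,
                     uint64_t *cur, uint64_t *rec);
int rados_grace_done(rados_ioctx_t io_ctx, const char *oid, uint64_t nodeid,
                     uint64_t *cur, uint64_t *rec);

/* What one node does across a restart cycle (io_ctx/nodeid are placeholders) */
static bool example_node_cycle(rados_ioctx_t io_ctx, uint64_t nodeid)
{
        uint64_t cur, rec;

        /* Startup: ensure the shared grace object exists (create is idempotent) */
        if (rados_grace_create(io_ctx, "grace") < 0)
                return false;

        /* Grace start: join, and learn the current epoch and recovery epoch */
        if (rados_grace_join(io_ctx, "grace", nodeid, &cur, &rec))
                return false;
        /* rec != 0: clients may reclaim from this node's db for epoch "rec" */

        /* ... allow reclaims, then report we no longer need the grace period ... */
        if (rados_grace_done(io_ctx, "grace", nodeid, &cur, &rec))
                return false;

        /* rec == 0 here means the cluster-wide grace period can be lifted */
        return rec == 0;
}

In the driver below, the same calls are made from rados_cluster_init(),
rados_cluster_read_clids(), and rados_cluster_try_lift_grace() respectively.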
Change-Id: Ic1ec91f5df7c5cbfa5254c646757b2b29e434dfb
Signed-off-by: Jeff Layton
---
 src/SAL/CMakeLists.txt                    |   3 +-
 src/SAL/nfs4_recovery.c                   |   2 +
 src/SAL/recovery/recovery_rados_cluster.c | 203 ++++++++++++++++++++++++++++++
 src/include/sal_functions.h               |   1 +
 4 files changed, 208 insertions(+), 1 deletion(-)
 create mode 100644 src/SAL/recovery/recovery_rados_cluster.c

diff --git a/src/SAL/CMakeLists.txt b/src/SAL/CMakeLists.txt
index 115ff04c97ad..8af718949b98 100644
--- a/src/SAL/CMakeLists.txt
+++ b/src/SAL/CMakeLists.txt
@@ -38,6 +38,7 @@ if(USE_RADOS_RECOV)
       ${sal_STAT_SRCS}
       recovery/recovery_rados_kv.c
       recovery/recovery_rados_ng.c
+      recovery/recovery_rados_cluster.c
    )
 endif(USE_RADOS_RECOV)
 
@@ -46,7 +47,7 @@ add_sanitizers(sal)
 
 if(USE_RADOS_RECOV)
    include_directories(${RADOS_INCLUDE_DIR})
-   target_link_libraries(sal ${RADOS_LIBRARIES})
+   target_link_libraries(sal rados_grace ${RADOS_LIBRARIES})
 endif(USE_RADOS_RECOV)
 
 ########### install files ###############
diff --git a/src/SAL/nfs4_recovery.c b/src/SAL/nfs4_recovery.c
index 619352f02575..bcdde4b27b09 100644
--- a/src/SAL/nfs4_recovery.c
+++ b/src/SAL/nfs4_recovery.c
@@ -423,6 +423,8 @@ static int load_backend(const char *name)
         rados_kv_backend_init(&recovery_backend);
     else if (!strcmp(name, "rados_ng"))
         rados_ng_backend_init(&recovery_backend);
+    else if (!strcmp(name, "rados_cluster"))
+        rados_cluster_backend_init(&recovery_backend);
 #endif
     else if (!strcmp(name, "fs_ng"))
         fs_ng_backend_init(&recovery_backend);
diff --git a/src/SAL/recovery/recovery_rados_cluster.c b/src/SAL/recovery/recovery_rados_cluster.c
new file mode 100644
index 000000000000..92d7b4fc7383
--- /dev/null
+++ b/src/SAL/recovery/recovery_rados_cluster.c
@@ -0,0 +1,203 @@
+/*
+ * vim:noexpandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright 2017 Red Hat, Inc. and/or its affiliates.
+ * Author: Jeff Layton
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * recovery_rados_cluster: a clustered recovery backing store
+ *
+ * We assume that each node has a unique nodeid, with a corresponding slot in
+ * the grace omap, and a rados_kv store for each server epoch.
+ *
+ * When the grace period is started, call into the rados_grace infrastructure
+ * to determine whether we're in a grace period and from what epoch we're
+ * allowed to recover state. Set the proper oid strings, and load the recovery
+ * db if applicable.
+ *
+ * When trying to lift the grace period, we just call into rados_grace
+ * infrastructure and return true or false based on the result.
+ */
+
+#include "config.h"
+#include <rados/librados.h>
+#include <urcu-bp.h>
+#include "log.h"
+#include "nfs_core.h"
+#include "sal_functions.h"
+#include "recovery_rados.h"
+
+/* FIXME: Make this configurable -- RADOS_KV param?
+ */
+#define RADOS_GRACE_OID		"grace"
+
+static void rados_cluster_init(void)
+{
+	int ret;
+
+	ret = rados_kv_connect(&rados_recov_io_ctx, rados_kv_param.userid,
+			       rados_kv_param.ceph_conf, rados_kv_param.pool);
+	if (ret < 0) {
+		LogEvent(COMPONENT_CLIENTID,
+			 "Failed to connect to cluster: %d", ret);
+		return;
+	}
+
+	ret = rados_grace_create(rados_recov_io_ctx, RADOS_GRACE_OID);
+	if (ret < 0 && ret != -EEXIST) {
+		LogEvent(COMPONENT_CLIENTID,
+			 "Failed to create grace db: %d", ret);
+		rados_kv_shutdown();
+	}
+	return;
+}
+
+/* Try to delete old recovery db */
+static void rados_cluster_cleanup(void)
+{
+	int ret;
+	rados_write_op_t wop;
+
+	if (rados_recov_old_oid[0] == '\0')
+		return;
+
+	wop = rados_create_write_op();
+	rados_write_op_remove(wop);
+	ret = rados_write_op_operate(wop, rados_recov_io_ctx,
+				     rados_recov_old_oid, NULL, 0);
+	if (ret)
+		LogEvent(COMPONENT_CLIENTID, "Failed to remove %s: %d",
+			 rados_recov_old_oid, ret);
+
+	memset(rados_recov_old_oid, '\0', sizeof(rados_recov_old_oid));
+}
+
+static void rados_cluster_read_clids(nfs_grace_start_t *gsp,
+				     add_clid_entry_hook add_clid_entry,
+				     add_rfh_entry_hook add_rfh_entry)
+{
+	int ret;
+	uint64_t cur, rec;
+	rados_write_op_t wop;
+	struct pop_args args = {
+		.add_clid_entry = add_clid_entry,
+		.add_rfh_entry = add_rfh_entry,
+	};
+
+	if (gsp) {
+		LogEvent(COMPONENT_CLIENTID,
+			 "Clustered rados backend does not support takeover!");
+		return;
+	}
+
+	/* Attempt to join the current grace period */
+	ret = rados_grace_join(rados_recov_io_ctx, RADOS_GRACE_OID,
+			       rados_kv_param.nodeid, &cur, &rec);
+	if (ret) {
+		LogEvent(COMPONENT_CLIENTID,
+			 "Failed to join grace period: %d", ret);
+		return;
+	}
+
+	/*
+	 * Recovery db names are "rec-nnnnnnnn:cccccccccccccccc"
+	 *
+	 * "rec-" followed by nodeid in 8 chars of hex followed by epoch in
+	 * 16 hex digits.
+	 */
+	snprintf(rados_recov_oid, sizeof(rados_recov_oid),
+		 "rec-%8.8x:%16.16lx", rados_kv_param.nodeid, cur);
+	wop = rados_create_write_op();
+	rados_write_op_create(wop, LIBRADOS_CREATE_IDEMPOTENT, NULL);
+	rados_write_op_omap_clear(wop);
+	ret = rados_write_op_operate(wop, rados_recov_io_ctx,
+				     rados_recov_oid, NULL, 0);
+	rados_release_write_op(wop);
+	if (ret < 0) {
+		LogEvent(COMPONENT_CLIENTID, "Failed to create recovery db");
+		return;
+	};
+
+	/*
+	 * If we're not in a grace period, then the join failed. No recovery
+	 * allowed.
+	 *
+	 * FIXME: Once cephfs allows us to reclaim earlier cephfs state in a
+	 *	  new incarnation of the same client, we can allow recovery
+	 *	  from "cur" instead of grace when ceph reclaim succeeds.
+	 *
+	 *	  BUT! We also need to fix stable client record creation. They
+	 *	  are currently being created during EXCHANGE_ID, but that
+	 *	  can lead to records being created for clients that hold no
+	 *	  state. In some reboot + network partition situations we could
+	 *	  end up allowing reclaim to some clients that should not.
+	 *
+	 *	  We need to fix the code to only set a client record for
+	 *	  clients that have at least one file open (either via reclaim
+	 *	  or new open). We should also remove the record when the
+	 *	  client closes its last file.
+	 *
+	 *	  This would ensure that the recovery db only has records
+	 *	  for clients that held state at the time of the crash.
+	 */
+	if (rec == 0) {
+		LogEvent(COMPONENT_CLIENTID,
+			 "Failed to join grace period: (rec == 0)");
+		return;
+	}
+
+	snprintf(rados_recov_old_oid, sizeof(rados_recov_old_oid),
+		 "rec-%8.8x:%16.16lx", rados_kv_param.nodeid, rec);
+	ret = rados_kv_traverse(rados_kv_pop_clid_entry, &args,
+				rados_recov_old_oid);
+	if (ret < 0)
+		LogEvent(COMPONENT_CLIENTID,
+			 "Failed to traverse recovery db: %d", ret);
+}
+
+/* FIXME */
+bool rados_cluster_try_lift_grace(void)
+{
+	int ret;
+	uint64_t cur, rec;
+
+	ret = rados_grace_done(rados_recov_io_ctx, RADOS_GRACE_OID,
+			       rados_kv_param.nodeid, &cur, &rec);
+	if (ret) {
+		LogEvent(COMPONENT_CLIENTID,
+			 "Attempt to lift grace failed: %d", ret);
+		return false;
+	}
+
+	/* Non-zero rec means grace is still in force */
+	return (rec == 0);
+}
+
+struct nfs4_recovery_backend rados_cluster_backend = {
+	.recovery_init = rados_cluster_init,
+	.recovery_read_clids = rados_cluster_read_clids,
+	.recovery_cleanup = rados_cluster_cleanup,
+	.add_clid = rados_kv_add_clid,
+	.rm_clid = rados_kv_rm_clid,
+	.add_revoke_fh = rados_kv_add_revoke_fh,
+	.try_lift_grace = rados_cluster_try_lift_grace,
+};
+
+void rados_cluster_backend_init(struct nfs4_recovery_backend **backend)
+{
+	*backend = &rados_cluster_backend;
+}
diff --git a/src/include/sal_functions.h b/src/include/sal_functions.h
index 259d911254a9..57dcc4509546 100644
--- a/src/include/sal_functions.h
+++ b/src/include/sal_functions.h
@@ -1022,6 +1022,7 @@ void fs_ng_backend_init(struct nfs4_recovery_backend **);
 int rados_kv_set_param_from_conf(config_file_t, struct config_error_type *);
 void rados_kv_backend_init(struct nfs4_recovery_backend **);
 void rados_ng_backend_init(struct nfs4_recovery_backend **);
+void rados_cluster_backend_init(struct nfs4_recovery_backend **backend);
 #endif
 
 #endif /* SAL_FUNCTIONS_H */
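As a side note, here is a small, self-contained illustration (not part of the
patch) of the recovery db naming scheme described in the comment in
rados_cluster_read_clids() above; the nodeid and epoch values are made up for
the example.

/* Illustrative only: how the per-node, per-epoch recovery db names look */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
        char oid[64];
        uint32_t nodeid = 2;            /* this node's id (example value) */
        uint64_t cur = 0x5a71d3a3;      /* current epoch (example value) */
        uint64_t rec = 0x5a71c001;      /* epoch to recover from (example value) */

        /* Same layout the driver uses: "rec-" + 8 hex nodeid + ":" + 16 hex epoch */
        snprintf(oid, sizeof(oid), "rec-%8.8x:%16.16" PRIx64, nodeid, cur);
        printf("new recovery db: %s\n", oid);   /* rec-00000002:000000005a71d3a3 */

        snprintf(oid, sizeof(oid), "rec-%8.8x:%16.16" PRIx64, nodeid, rec);
        printf("old recovery db: %s\n", oid);   /* rec-00000002:000000005a71c001 */
        return 0;
}

Each node thus gets a fresh, omap-cleared db object for the current epoch
(cur), while reclaim data is read back from the object named for the previous
epoch (rec).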