[nfs-ganesha,RFC,4/6] support: add a rados_grace support library
diff mbox

Message ID 20180131141219.16929-5-jlayton@poochiereds.net
State New
Headers show

Commit Message

Jeff Layton Jan. 31, 2018, 2:12 p.m. UTC
From: Jeff Layton <jlayton@redhat.com>

Add a new support library for coordinating a grace period between
multiple nodes. The basic idea is to use a shared RADOS object to
mediate requests for a grace period.

Typically, each node will perform a read operation on the database
and (then possibly) a write operation. In order to ensure atomicity,
we assert that the object version has not changed between the read
and write ops. If it has, then we start over again from scratch.

Change-Id: Idfff61f8d0092fd9095dc14390dfb434f0ad6696
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 src/include/rados_grace.h  |  33 ++++
 src/support/CMakeLists.txt |   4 +
 src/support/rados_grace.c  | 441 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 478 insertions(+)
 create mode 100644 src/include/rados_grace.h
 create mode 100644 src/support/rados_grace.c

Patch
diff mbox

diff --git a/src/include/rados_grace.h b/src/include/rados_grace.h
new file mode 100644
index 000000000000..ed26a2f071dd
--- /dev/null
+++ b/src/include/rados_grace.h
@@ -0,0 +1,33 @@ 
+#ifndef _RADOS_GRACE_H
+#define _RADOS_GRACE_H
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ * ---------------------------------------
+ */
+int rados_grace_create(rados_ioctx_t io_ctx, const char *oid);
+int rados_grace_dump(rados_ioctx_t io_ctx, const char *oid);
+int rados_grace_start(rados_ioctx_t io_ctx, const char *oid, int nodes,
+			char **nodeids);
+int rados_grace_start(rados_ioctx_t io_ctx, const char *oid, int nodes,
+			char **nodeids);
+int rados_grace_join(rados_ioctx_t io_ctx, const char *oid, uint32_t nodeid,
+			uint64_t *pcur, uint64_t *prec);
+int rados_grace_lift(rados_ioctx_t io_ctx, const char *oid, int nodes,
+			char **nodeids);
+int rados_grace_done(rados_ioctx_t io_ctx, const char *oid, uint32_t nodeid,
+			uint64_t *pcur, uint64_t *prec);
+#endif /* _RADOS_GRACE_H */
diff --git a/src/support/CMakeLists.txt b/src/support/CMakeLists.txt
index 4eb7b828a33f..05364d112350 100644
--- a/src/support/CMakeLists.txt
+++ b/src/support/CMakeLists.txt
@@ -41,6 +41,10 @@  set(netgroup_cache_SRCS
 add_library(netgroup_cache STATIC ${netgroup_cache_SRCS})
 add_sanitizers(netgroup_cache)
 
+if (USE_RADOS_RECOV)
+   add_library(rados_grace STATIC rados_grace.c)
+endif(USE_RADOS_RECOV)
+
 ########### next target ###############
 
 SET(support_STAT_SRCS
diff --git a/src/support/rados_grace.c b/src/support/rados_grace.c
new file mode 100644
index 000000000000..964155d3b186
--- /dev/null
+++ b/src/support/rados_grace.c
@@ -0,0 +1,441 @@ 
+/*
+ * vim:noexpandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright 2017 Red Hat, Inc. and/or its affiliates.
+ * Author: Jeff Layton <jlayton@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * rados-grace: tool for managing coordinated grace period database
+ *
+ * The rados-grace database is a rados object with a well-known name that
+ * with which all cluster nodes can interact to coordinate grace-period
+ * enforcement.
+ *
+ * It consists of two parts:
+ *
+ * 1) 2 uint64_t epoch values (stored LE) that indicate the serial number of
+ * the current grace period (C) and the serial number of the grace period that
+ * from which recovery is currently allowed (R). These are stored as object
+ * data.
+ *
+ * 2) An omap containing a key for each node that currently requires a grace
+ * period.
+ *
+ * Consider a single server epoch (E) of an individual NFS server to be the
+ * period between reboots. That consists of an initial grace period and
+ * a regular operation period. An epoch value of 0 is never valid.
+ *
+ * The first value (C) indicates the current server epoch. The client recovery
+ * db should be tagged with this value on creation, or when updating the db
+ * after the grace period has been fully lifted.
+ *
+ * The second uint64_t value in the data tells the NFS server from what
+ * recovery db it is allowed to reclaim. A value of 0 in this field means that
+ * we are out of the cluster-wide grace period and that no recovery is allowed.
+ *
+ * The cluster manager (or sentient administrator) begins a new grace period by
+ * passing in a number of nodes as an initial set. If the current recovery
+ * serial number is set to 0, then we'll copy the current value to the recovery
+ * serial number, and increment the current value by 1. At that point, a new
+ * epoch is established and the cluster-wide grace period begins.
+ *
+ * As nodes come up, we must decide whether to allow NFS reclaim and what
+ * epoch's database the server should use. This requires 2 inputs:
+ *
+ * 1) whether we were successful in reclaiming the cephfs state of a previous
+ *    instance of this ganesha's ceph client.
+ *
+ * 2) whether we're currently in a cluster-wide grace period.
+ *
+ * If the cephfs reclaim was successful and we are in a grace period, then
+ * NFS reclaim should be allowed from the current reclaim epoch (R). If
+ * cephfs reclaim was successful and we are not in a grace period, then NFS
+ * reclaim is allowed for the current epoch (C).
+ *
+ * If the cephfs reclaim is not successful and we are not in a grace period,
+ * then no NFS reclaim is allowed. If cephfs reclaim is not successful and we
+ * are in a grace period, then we allow reclaim for epoch (R).
+ *
+ * Each server comes up, and first checks whether a cluster-wide grace period
+ * is in force. If it is, then it sets its own grace period request flag (if
+ * necessary) and then begins recovery according to the rules above.
+ *
+ * As each node completes its own recovery, it clears its flag in the omap. The
+ * node that clears the last flag will then lift the grace period fully by
+ * setting the reclaim epoch R to 0. Eventually other nodes will poll the
+ * grace object, notice that the grace period is lifted and will transition
+ * to normal operation.
+ */
+#include "config.h"
+#include <stdio.h>
+#include <stdint.h>
+#include <endian.h>
+#include <rados/librados.h>
+#include <errno.h>
+#include <getopt.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <stdbool.h>
+
+#define MAX_ITEMS			1024
+
+int
+rados_grace_create(rados_ioctx_t io_ctx, const char *oid)
+{
+	int			ret;
+	rados_write_op_t	op = NULL;
+	uint64_t		cur = htole64(1);	// starting epoch
+	uint64_t		rec = htole64(0);	// no recovery yet
+	char			buf[sizeof(uint64_t) * 2];
+
+	/*
+	 * 2 uint64_t's
+	 *
+	 * The first denotes the current epoch serial number, the epoch serial
+	 * number under which new recovery records should be created.
+	 *
+	 * The second number denotes the epoch from which clients are allowed
+	 * to reclaim.
+	 *
+	 * An epoch of zero is never allowed, so if rec=0, then the grace
+	 * period is no longer in effect, and can't be joined.
+	 */
+	memcpy(buf, (char *)&cur, sizeof(cur));
+	memcpy(buf + sizeof(cur), (char *)&rec, sizeof(rec));
+
+	op = rados_create_write_op();
+	/* Create the object */
+	rados_write_op_create(op, LIBRADOS_CREATE_EXCLUSIVE, NULL);
+
+	/* Set serial numbers if we created the object */
+	rados_write_op_write_full(op, buf, sizeof(buf));
+
+	ret = rados_write_op_operate(op, io_ctx, oid, NULL, 0);
+	rados_release_write_op(op);
+	return ret;
+}
+
+int
+rados_grace_dump(rados_ioctx_t io_ctx, const char *oid)
+{
+	int ret;
+	rados_omap_iter_t iter;
+	rados_read_op_t op;
+	char *key_out = NULL;
+	char *val_out = NULL;
+	unsigned char more = '\0';
+	size_t len_out = 0;
+	char buf[sizeof(uint64_t) * 2];
+	uint64_t	cur, rec;
+
+	op = rados_create_read_op();
+	rados_read_op_read(op, 0, sizeof(buf), buf, &len_out, NULL);
+	rados_read_op_omap_get_keys2(op, "", MAX_ITEMS, &iter, &more, NULL);
+	ret = rados_read_op_operate(op, io_ctx, oid, 0);
+	if (ret < 0) {
+		printf("%s: ret=%d", __func__, ret);
+		goto out;
+	}
+
+	if (len_out != sizeof(buf)) {
+		ret = -ENOTRECOVERABLE;
+		goto out;
+	}
+
+	if (more) {
+		ret = -ENOTRECOVERABLE;
+		goto out;
+	}
+
+	cur = le64toh(*(uint64_t *)buf);
+	rec = le64toh(*(uint64_t *)(buf + sizeof(uint64_t)));
+	printf("cur=%lu rec=%lu\n", cur, rec);
+	for (;;) {
+		rados_omap_get_next(iter, &key_out, &val_out, &len_out);
+		if (key_out == NULL)
+			break;
+		printf("%s ", key_out);
+	}
+	printf("\n");
+out:
+	rados_release_read_op(op);
+	return ret;
+}
+
+static int
+__rados_grace_start(rados_ioctx_t io_ctx, const char *oid, int nodes,
+			const char **nodeids, uint64_t *pcur, uint64_t *prec,
+			bool start)
+{
+	int			ret;
+	char			buf[sizeof(uint64_t) * 2];
+	char			**vals = NULL;
+	size_t			*lens = NULL;
+	size_t			len = 0;
+	uint64_t		cur, rec, ver;
+
+	vals = calloc(nodes, sizeof(char *));
+	if (!vals) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	lens = calloc(nodes, sizeof(size_t));
+	if (!lens) {
+		ret = -ENOMEM;
+		free(vals);
+		goto out;
+	}
+
+	do {
+		rados_write_op_t	wop;
+		rados_read_op_t		rop;
+
+		/* read epoch blob */
+		rop = rados_create_read_op();
+		rados_read_op_read(rop, 0, sizeof(buf), buf, &len, NULL);
+		ret = rados_read_op_operate(rop, io_ctx, oid, 0);
+		rados_release_read_op(rop);
+		if (ret < 0)
+			break;
+
+		if (len != sizeof(buf)) {
+			ret = -ENOTRECOVERABLE;
+			break;
+		}
+
+		/* Get old epoch numbers and version */
+		cur = le64toh(*(uint64_t *)buf);
+		rec = le64toh(*(uint64_t *)(buf + sizeof(uint64_t)));
+		ver = rados_get_last_version(io_ctx);
+
+		/* Only start a new grace period if force flag is set */
+		if (rec == 0 && !start)
+			break;
+
+		/* Attempt to update object */
+		wop = rados_create_write_op();
+
+		/* Ensure that nothing has changed */
+		rados_write_op_assert_version(wop, ver);
+
+		/* Update the object iff rec == 0 */
+		if (rec == 0) {
+			uint64_t tc, tr;
+
+			rec = cur;
+			++cur;
+			tc = htole64(cur);
+			tr = htole64(rec);
+			memcpy(buf, (char *)&tc, sizeof(tc));
+			memcpy(buf + sizeof(tc), (char *)&tr, sizeof(tr));
+			rados_write_op_write_full(wop, buf, sizeof(buf));
+		}
+
+		/* Set omap values to given ones */
+		rados_write_op_omap_set(wop, nodeids,
+				(const char * const*)vals, lens, nodes);
+
+		ret = rados_write_op_operate(wop, io_ctx, oid, NULL, 0);
+		rados_release_write_op(wop);
+	} while (ret == -ERANGE);
+
+	if (!ret) {
+		*pcur = cur;
+		*prec = rec;
+	}
+out:
+	free(lens);
+	free(vals);
+	return ret;
+}
+
+int
+rados_grace_start(rados_ioctx_t io_ctx, const char *oid, int nodes,
+			const char **nodeids)
+{
+	uint64_t	cur, rec;
+
+	return __rados_grace_start(io_ctx, oid, nodes, nodeids, &cur,
+					&rec, true);
+}
+
+int
+rados_grace_join(rados_ioctx_t io_ctx, const char *oid, uint32_t nodeid,
+			uint64_t *pcur, uint64_t *prec)
+{
+	char		buf[11];
+	const char	*nodeids[1];
+
+	if (nodeid == UINT32_MAX)
+		return -EINVAL;
+
+	snprintf(buf, sizeof(buf), "%u", nodeid);
+	nodeids[0] = buf;
+	return __rados_grace_start(io_ctx, oid, 1, nodeids, pcur, prec, false);
+}
+
+static int
+__rados_grace_lift(rados_ioctx_t io_ctx, const char *oid, int nodes,
+			const char **nodeids, uint64_t *pcur, uint64_t *prec)
+{
+	int			ret;
+	const char		**keys = NULL;
+	uint64_t		cur, rec;
+
+	keys = calloc(nodes, sizeof(char *));
+	if (!keys) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	do {
+		int			i, k, cnt;
+		rados_write_op_t	wop;
+		rados_read_op_t		rop;
+		rados_omap_iter_t	iter;
+		char			*key, *val;
+		size_t			len;
+		char			buf[sizeof(uint64_t) * 2];
+		unsigned char		more = '\0';
+		uint64_t		ver;
+
+		/* read epoch blob */
+		rop = rados_create_read_op();
+		rados_read_op_read(rop, 0, sizeof(buf), buf, &len, NULL);
+		rados_read_op_omap_get_keys2(rop, "", MAX_ITEMS, &iter,
+						&more, NULL);
+		ret = rados_read_op_operate(rop, io_ctx, oid, 0);
+		if (ret < 0) {
+			rados_release_read_op(rop);
+			break;
+		}
+
+		if (more) {
+			ret = -ENOTRECOVERABLE;
+			break;
+		}
+
+		if (len != sizeof(buf)) {
+			ret = -ENOTRECOVERABLE;
+			break;
+		}
+
+		/* Get old epoch numbers and version, iff rec == 0 */
+		cur = le64toh(*(uint64_t *)buf);
+		rec = le64toh(*(uint64_t *)(buf + sizeof(uint64_t)));
+		ver = rados_get_last_version(io_ctx);
+		rados_release_read_op(rop);
+
+		/*
+		 * If we're not in grace period, then there should be no
+		 * records in the omap. Either way, we don't want to alter
+		 * anything in this case.
+		 */
+		if (rec == 0) {
+			ret = rados_omap_get_next(iter, &key, &val, &len);
+			if (ret < 0)
+				break;
+			if (key != NULL) {
+				ret = -ENOTRECOVERABLE;
+			}
+			break;
+		}
+
+		/*
+		 * Walk omap keys, see if it's in nodeids array.
+		 * Add any that are to the "keys" array.
+		 */
+		cnt = 0;
+		k = 0;
+		for (;;) {
+			ret = rados_omap_get_next(iter, &key, &val, &len);
+			if (!key)
+				break;
+			++cnt;
+			for (i = 0; i < nodes; ++i) {
+				if (strcmp(key, nodeids[i]))
+					continue;
+				keys[k] = nodeids[i];
+				++k;
+			}
+		};
+
+		/* No matching keys? Nothing to do. */
+		if (k == 0)
+			break;
+
+		/* Attempt to update object */
+		wop = rados_create_write_op();
+
+		/* Ensure that nothing has changed */
+		rados_write_op_assert_version(wop, ver);
+
+		/* Remove keys from array */
+		rados_write_op_omap_rm_keys(wop, (const char * const*)keys, k);
+
+		/*
+		 * If number of omap records we're removing == number of keys,
+		 * then fully lift the grace period.
+		 */
+		if (cnt == k) {
+			uint64_t tc, tr;
+
+			rec = 0;
+			tr = htole64(rec);
+			tc = htole64(cur);
+			memcpy(buf, (char *)&tc, sizeof(tc));
+			memcpy(buf + sizeof(tc), (char *)&tr, sizeof(tr));
+			rados_write_op_write_full(wop, buf, sizeof(buf));
+		}
+
+		ret = rados_write_op_operate(wop, io_ctx, oid, NULL, 0);
+		rados_release_write_op(wop);
+	} while (ret == -ERANGE);
+	if (!ret) {
+		*pcur = cur;
+		*prec = rec;
+	}
+out:
+	free(keys);
+	return ret;
+}
+
+int
+rados_grace_lift(rados_ioctx_t io_ctx, const char *oid, int nodes,
+			const char **nodeids)
+{
+	uint64_t	cur, rec;
+
+	return __rados_grace_lift(io_ctx, oid, nodes, nodeids, &cur,
+					&rec);
+}
+
+int
+rados_grace_done(rados_ioctx_t io_ctx, const char *oid, uint32_t nodeid,
+			uint64_t *pcur, uint64_t *prec)
+{
+	char		buf[11];
+	const char	*nodeids[1];
+
+	if (nodeid == UINT32_MAX)
+		return -EINVAL;
+
+	snprintf(buf, sizeof(buf), "%u", nodeid);
+	nodeids[0] = buf;
+	return __rados_grace_lift(io_ctx, oid, 1, nodeids, pcur, prec);
+}