diff mbox

Introduce ceph_ectool

Message ID orpp67j2g7.fsf@livre.home (mailing list archive)
State New, archived
Headers show

Commit Message

Alexandre Oliva May 11, 2015, 2:03 a.m. UTC
If a CRUSH rule change wants to move shards of EC pools, and any file
fails to read due to a bad block, the OSD will crash; to avoid further
unexpected failures, it must then be kept down until other OSDs recover
the PG.

With ceph_ectool, you can recompute the contents of the broken file out
of the object proper, if the PG is active, or out of files extracted
from other active shards, and replace the broken file, so that the OSD
won't crash any more.

I'm aware of the jerasure example program that purports to do the same,
but the erasures generated by it are not compatible with those generated
by ceph, at least not for the cauchy_good erasures in my 4+4 setting.
With ceph_ectool, I have managed to recover some severely degraded
objects and recompute data and erasure codes of various files.

Signed-off-by: Alexandre Oliva <oliva@gnu.org>
---
 src/tools/Makefile.am    |    6 +
 src/tools/ceph_ectool.cc |  247 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 253 insertions(+)
 create mode 100644 src/tools/ceph_ectool.cc
diff mbox

Patch

diff --git a/src/tools/Makefile.am b/src/tools/Makefile.am
index 1a73995..e4ea0d0 100644
--- a/src/tools/Makefile.am
+++ b/src/tools/Makefile.am
@@ -102,6 +102,12 @@  ceph_mon_store_converter_SOURCES = tools/mon_store_converter.cc
 ceph_mon_store_converter_LDADD = $(LIBMON) $(LIBOS) $(CEPH_GLOBAL)
 bin_PROGRAMS += ceph_mon_store_converter
 
+ceph_ectool_SOURCES = tools/ceph_ectool.cc
+ceph_ectool_CXXFLAGS = ${AM_CXXFLAGS} \
+	-I$(top_srcdir)/src/erasure-code -I$(top_srcdir)/src/osd
+ceph_ectool_LDADD = $(LIBOS) $(LIBOSD_TYPES) $(CEPH_GLOBAL)
+bin_PROGRAMS += ceph_ectool
+
 noinst_HEADERS += \
 	tools/cephfs/JournalTool.h \
 	tools/cephfs/JournalScanner.h \
diff --git a/src/tools/ceph_ectool.cc b/src/tools/ceph_ectool.cc
new file mode 100644
index 0000000..04b3894
--- /dev/null
+++ b/src/tools/ceph_ectool.cc
@@ -0,0 +1,247 @@ 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Alexandre Oliva <oliva@gnu.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "erasure-code/ErasureCodePlugin.h"
+#include "osd/ECUtil.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "common/config.h"
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+// Offline helper: recompute missing erasure-coded shard files (or the object)
+// from the files that are present, and cross-check present shards against a
+// present object.  Arguments: profile-file stripe-width object shard0..shardN.
+// Returns 0 on success or after printing usage, 1 on any reported error.
+int main(int argc, char *argv[]) {
+  if (argc < 3) {
+    cout << "usage: ceph_ectool profile stripe object shard0 shard1..." << endl
+	 << "profile names a file with the output of the command" << endl
+	 << "  ceph osd erasure-code-profile get <profile>" << endl
+	 << "stripe is the stripe width of the pool" << endl
+	 << "object is the full object filename" << endl
+	 << "shard# is the filename of the given shard" << endl
+	 << "files named -- or an empty string are assumed absent" << endl
+	 << "files named but absent are created with decoded data" << endl
+	 << "given object and shards are verified against each other" << endl;
+    return 0;
+  }
+  // Set up the global Ceph context and send log/error output to stderr.
+  vector<const char *> ceph_options, def_args;
+  global_init(&def_args, ceph_options, CEPH_ENTITY_TYPE_OSD,
+	      CODE_ENVIRONMENT_UTILITY_NODOUT, 0);
+  common_init_finish(g_ceph_context);
+  g_conf = g_ceph_context->_conf;
+  g_conf->set_val_or_die("log_to_stderr", "true");
+  g_conf->set_val_or_die("err_to_stderr", "true");
+  g_conf->apply_changes(NULL);
+  // Parse the profile file (one key=value pair per line) into a map.
+  map<string,string> profile;
+  {
+    char *pname = argv[1];
+    int fd = open(pname, O_RDONLY);
+    if (fd == -1) {
+      cerr << pname << " not found" << endl;
+      return 1;
+    }
+    struct stat st = {};
+    int r = fstat(fd, &st);
+    assert(r == 0);
+    int len = st.st_size;
+    string buf(len, '\0');  // heap storage instead of a stack VLA
+    bool whole = read(fd, &buf[0], len) == len;
+    close(fd);  // the descriptor is not needed once the contents are in buf
+    if (!whole) {
+      cerr << pname << ": failed to read " << len << " bytes" << endl;
+      return 1;
+    }
+    for (int i = 0; i < len; i++) {
+      string first;
+      while (i < len && buf[i] != '=' && buf[i] != '\n')
+	first += buf[i++];
+      if (i == len || buf[i] == '\n') {
+	cerr << pname << " does not look like a profile at line " << first << endl;
+	return 1;
+      }
+      assert (buf[i] == '=');
+      i++;
+      string second;
+      while (i < len && buf[i] != '\n')
+	second += buf[i++];
+      assert (i == len || buf[i] == '\n');
+      profile[first] = second;
+    }
+    if (!profile.count("plugin")) {
+      cerr << pname << " does not define plugin=<name>" << endl;
+      return 1;
+    }
+  }
+  ErasureCodeInterfaceRef ec_impl;
+  {
+    stringstream ss;
+    ceph::ErasureCodePluginRegistry::instance().
+      factory(profile.find("plugin")->second,
+	      profile, &ec_impl, ss);
+    assert(ec_impl);
+  }
+
+  int k = ec_impl->get_data_chunk_count();
+  int m = ec_impl->get_coding_chunk_count();
+  int stripe_width = atoi (argv[2]);
+
+  if (stripe_width <= 0 || stripe_width % k) {
+    cerr << "stripe must be a positive multiple of " << k << endl;
+    return 1;
+  }
+  // Exactly one object name plus one name per shard (k + m) is required.
+  if (argc != 1 + 2 + 1 + k + m) {
+    cerr << "wrong number of filenames given: got " << argc - 3
+	 << ", expected " << 1 + k + m << endl;
+    return 1;
+  }
+  // Slot 0 is the object; slots 1..k+m hold shards 0..k+m-1.
+  char **name = argv + 3;
+  bool named[1+k+m];
+  bool present[1+k+m];
+  bufferlist data[1+k+m];
+  int bufsize = 0;
+  int objsize = 0;
+  int size_file = -1;  // slot of the first file read, which fixes the sizes
+  for (int i = 0; i <= k + m; i++) {
+    named[i] = present[i] = name[i][0] && strcmp (name[i], "--"); // "" or "--"
+    if (named[i]) {
+      int fd = ::open(name[i], O_RDONLY);
+      if (fd < 0)
+	present[i] = false;
+      else {
+	struct stat st = {};
+	int r = fstat(fd, &st);
+	assert(r == 0);
+	int len = st.st_size;
+	int xobjsize = len * (i ? k : 1);  // object size implied by this file
+	if (size_file == -1) {
+	  if (xobjsize % stripe_width) {
+	    if (!i)
+	      cerr << "object must be padded to a multiple of "
+		   << stripe_width << " bytes" << endl;
+	    else
+	      cerr << "shard must be padded to be a multiple of "
+		   << stripe_width / k << " bytes" << endl;
+	    return 1;
+	  }
+
+	  bufsize = xobjsize / k;
+	  objsize = xobjsize;
+	  size_file = i;
+	} else if ((i ? bufsize : objsize) != len) {
+	  cerr << name[i] << " has " << len << " bytes, but based on "
+	       << name[size_file] << " we expected "
+	       << (i ? bufsize : objsize) << " bytes" << endl;
+	  return 1;
+	}
+
+	bufferptr bp(len);
+	if (read(fd, bp.c_str(), len) != len) {
+	  cerr << name[i] << ": failed to read " << len << " bytes" << endl;
+	  return 1;
+	}
+
+	close(fd);
+
+	data[i].push_back(bp);
+      }
+    }
+  }
+
+  if (size_file == -1) {
+    cerr << "no given file" << endl;
+    return 1;
+  }
+
+  ECUtil::stripe_info_t sinfo(ec_impl->get_data_chunk_count(), stripe_width);
+  // Without the object, decode from the present shards; otherwise encode it.
+  if (!present[0]) {
+    map<int, bufferlist> to_decode;
+    map<int, bufferlist*> out;
+    for (int i = 1, s = 0; i <= k + m; i++, s++) {
+      if (!named[i])
+	continue;
+
+      if (present[i])
+	to_decode[s] = data[i];
+      else
+	out[s] = &data[i];
+    }
+
+    if (named[0]) {  // present[0] is already known to be false here
+      if (ECUtil::decode(sinfo, ec_impl, to_decode, &data[0]) != 0) {
+	cerr << "reconstruction of object failed" << endl;
+	return 1;
+      }
+
+      int fd = creat(name[0], 0600);
+      if (fd < 0 || data[0].write_fd(fd) != 0 || close(fd) != 0) {
+	cerr << "failed to write to " << name[0] << endl;
+	return 1;
+      }
+    }
+
+    if (ECUtil::decode(sinfo, ec_impl, to_decode, out) != 0) {
+      cerr << "reconstruction of missing shards failed" << endl;
+      return 1;
+    }
+
+    for (int i = 1, s = 0; i <= k + m; i++, s++) {
+      if (!named[i])
+	continue;
+
+      if (!present[i]) {
+	int fd = creat(name[i], 0600);
+	if (fd < 0 || data[i].write_fd(fd) != 0 || close(fd) != 0) {
+	  cerr << "failed to write to " << name[i] << endl;
+	  return 1;
+	}
+      }
+    }
+  } else {
+    set<int> want;
+    map<int, bufferlist> out;
+
+    for (int i = 1, s = 0; i <= k + m; i++, s++) {
+      if (named[i])
+	want.insert(s);
+    }
+
+    if (ECUtil::encode(sinfo, ec_impl, data[0], want, &out) != 0) {
+      cerr << "encoding failed" << endl;
+      return 1;
+    }
+
+    for (int i = 1, s = 0; i <= k + m; i++, s++) {
+      if (!named[i])
+	continue;
+
+      if (!present[i]) {
+	int fd = creat(name[i], 0600);
+	if (fd < 0 || out[s].write_fd(fd) != 0 || close(fd) != 0) {
+	  cerr << "failed to write to " << name[i] << endl;
+	  return 1;
+	}
+      } else if (!(data[i] == out[s])) {
+	cerr << name[i] << " deviates from the data encoded from "
+	     << name[0] << endl;
+	return 1;
+      }
+    }
+  }
+}