From patchwork Mon May 11 02:03:04 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alexandre Oliva X-Patchwork-Id: 6372531 Return-Path: X-Original-To: patchwork-ceph-devel@patchwork.kernel.org Delivered-To: patchwork-parsemail@patchwork1.web.kernel.org Received: from mail.kernel.org (mail.kernel.org [198.145.29.136]) by patchwork1.web.kernel.org (Postfix) with ESMTP id 79E1B9F1C2 for ; Mon, 11 May 2015 02:18:27 +0000 (UTC) Received: from mail.kernel.org (localhost [127.0.0.1]) by mail.kernel.org (Postfix) with ESMTP id 3234620382 for ; Mon, 11 May 2015 02:18:26 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id 68DE92037F for ; Mon, 11 May 2015 02:18:24 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752055AbbEKCSW (ORCPT ); Sun, 10 May 2015 22:18:22 -0400 Received: from linux-libre.fsfla.org ([208.118.235.54]:54748 "EHLO linux-libre.fsfla.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751435AbbEKCSW (ORCPT ); Sun, 10 May 2015 22:18:22 -0400 X-Greylist: delayed 878 seconds by postgrey-1.27 at vger.kernel.org; Sun, 10 May 2015 22:18:21 EDT Received: from freie.home (home.lxoliva.fsfla.org [172.31.160.22]) by linux-libre.fsfla.org (8.14.4/8.14.4/Debian-4.1ubuntu1) with ESMTP id t4B23YWo001883 for ; Mon, 11 May 2015 02:03:36 GMT Received: from livre.home (livre.home [172.31.160.2]) by freie.home (8.14.8/8.14.8) with ESMTP id t4B238Rb030776; Sun, 10 May 2015 23:03:10 -0300 From: Alexandre Oliva To: ceph-devel@vger.kernel.org Subject: Introduce ceph_ectool Organization: Free thinker, not speaking for the GNU Project Date: Sun, 10 May 2015 23:03:04 -0300 Message-ID: User-Agent: Gnus/5.13 (Gnus v5.13) Emacs/24.3 (gnu/linux) MIME-Version: 1.0 Sender: ceph-devel-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: ceph-devel@vger.kernel.org X-Spam-Status: No, 
score=-6.9 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_HI, T_RP_MATCHES_RCVD, UNPARSEABLE_RELAY autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on mail.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP If a CRUSH rule change wants to move shards of EC pools, and any file fails to read due to a bad block, the OSD will crash, and to avoid further unexpected failures it must be kept down until other OSDs recover the PG. With ceph_ectool, you can recompute the contents of the broken file out of the object proper, if the PG is active, or out of files extracted from other active shards, and replace the broken file, so that the OSD won't crash any more. I'm aware of the jerasure example program that purports to do the same, but the erasures generated by it are not compatible with those generated by Ceph, at least not for the cauchy_good erasures in my 4+4 setting. With ceph_ectool, I have managed to recover some severely degraded objects and recompute data and erasure codes of various files. 
Signed-off-by: Alexandre Oliva
---
 src/tools/Makefile.am    |   6 +
 src/tools/ceph_ectool.cc | 344 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 350 insertions(+)
 create mode 100644 src/tools/ceph_ectool.cc

diff --git a/src/tools/Makefile.am b/src/tools/Makefile.am
index 1a73995..e4ea0d0 100644
--- a/src/tools/Makefile.am
+++ b/src/tools/Makefile.am
@@ -102,6 +102,12 @@ ceph_mon_store_converter_SOURCES = tools/mon_store_converter.cc
 ceph_mon_store_converter_LDADD = $(LIBMON) $(LIBOS) $(CEPH_GLOBAL)
 bin_PROGRAMS += ceph_mon_store_converter
 
+ceph_ectool_SOURCES = tools/ceph_ectool.cc
+ceph_ectool_CXXFLAGS = ${AM_CXXFLAGS} \
+	-I$(top_srcdir)/src/erasure-code -I$(top_srcdir)/src/osd
+ceph_ectool_LDADD = $(LIBOS) $(LIBOSD_TYPES) $(CEPH_GLOBAL)
+bin_PROGRAMS += ceph_ectool
+
 noinst_HEADERS += \
 	tools/cephfs/JournalTool.h \
 	tools/cephfs/JournalScanner.h \
diff --git a/src/tools/ceph_ectool.cc b/src/tools/ceph_ectool.cc
new file mode 100644
index 0000000..04b3894
--- /dev/null
+++ b/src/tools/ceph_ectool.cc
@@ -0,0 +1,344 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Alexandre Oliva
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+/*
+ * ceph_ectool - standalone command-line tool that encodes, decodes and
+ * cross-checks the object file and shard files of an erasure-coded
+ * object.  Run it with fewer than two arguments for the synopsis.
+ */
+
+#include "erasure-code/ErasureCodePlugin.h"
+#include "osd/ECUtil.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "common/config.h"
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <iostream>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace {
+
+// Owns a file descriptor and closes it on scope exit, so that every
+// early return releases it.
+class ScopedFd {
+public:
+  explicit ScopedFd(int fd) : fd_(fd) {}
+  ~ScopedFd() { if (fd_ >= 0) ::close(fd_); }
+  int get() const { return fd_; }
+private:
+  ScopedFd(const ScopedFd &);             // not copyable
+  ScopedFd &operator=(const ScopedFd &);  // not assignable
+  int fd_;
+};
+
+// Reads exactly len bytes from fd into dst, retrying after short reads
+// and EINTR.  Returns false on error or premature end of file.
+bool read_fully(int fd, char *dst, size_t len)
+{
+  while (len > 0) {
+    ssize_t got = ::read(fd, dst, len);
+    if (got < 0 && errno == EINTR)
+      continue;
+    if (got <= 0)
+      return false;
+    dst += got;
+    len -= got;
+  }
+  return true;
+}
+
+// Creates path (mode 0600, truncating any existing file) and writes bl
+// into it.  Returns false if creating, writing or closing fails.
+bool write_new_file(const char *path, bufferlist &bl)
+{
+  int fd = ::creat(path, 0600);
+  if (fd < 0)
+    return false;
+  bool ok = bl.write_fd(fd) == 0;
+  if (::close(fd) != 0)
+    ok = false;
+  return ok;
+}
+
+// Loads the "key=value" lines of the file pname into profile and makes
+// sure a "plugin" key is among them.  Returns false, after reporting
+// the problem on cerr, if the file is unreadable or malformed.
+bool load_profile(const char *pname, map<string,string> &profile)
+{
+  ScopedFd fd(::open(pname, O_RDONLY));
+  if (fd.get() == -1) {
+    cerr << pname << " not found" << endl;
+    return false;
+  }
+
+  struct stat st;
+  memset(&st, 0, sizeof(st));
+  if (fstat(fd.get(), &st) != 0 || st.st_size > INT_MAX) {
+    cerr << pname << ": cannot determine a usable size" << endl;
+    return false;
+  }
+
+  int len = st.st_size;
+  string buf(len, '\0');
+  if (len > 0 && !read_fully(fd.get(), &buf[0], len)) {
+    cerr << pname << ": failed to read " << len << " bytes" << endl;
+    return false;
+  }
+
+  for (int i = 0; i < len; i++) {
+    string key;
+    while (i < len && buf[i] != '=' && buf[i] != '\n')
+      key += buf[i++];
+    if (i == len || buf[i] == '\n') {
+      cerr << pname << " does not look like a profile at line "
+           << key << endl;
+      return false;
+    }
+    i++;  // step over the '='
+    string value;
+    while (i < len && buf[i] != '\n')
+      value += buf[i++];
+    profile[key] = value;
+  }
+
+  if (!profile.count("plugin")) {
+    cerr << pname << " does not define plugin=" << endl;
+    return false;
+  }
+  return true;
+}
+
+}  // anonymous namespace
+
+int main(int argc, char *argv[]) {
+  if (argc < 3) {
+    cout << "usage: ceph_ectool profile stripe object shard0 shard1..."
+         << endl
+         << "profile names a file with the output of the command" << endl
+         << " ceph osd erasure-code-profile get " << endl
+         << "stripe is the stripe width of the pool" << endl
+         << "object is the full object filename" << endl
+         << "shard# is the filename of the given shard" << endl
+         << "files named -- or an empty string are assumed absent" << endl
+         << "files named but absent are created with decoded data" << endl
+         << "given object and shards are verified against each other"
+         << endl;
+    return 0;
+  }
+
+  // NOTE(review): vector<const char*> is assumed to be what
+  // global_init() takes -- confirm against global/global_init.h.
+  vector<const char*> ceph_options, def_args;
+  global_init(&def_args, ceph_options, CEPH_ENTITY_TYPE_OSD,
+              CODE_ENVIRONMENT_UTILITY_NODOUT, 0);
+  common_init_finish(g_ceph_context);
+  g_conf = g_ceph_context->_conf;
+  g_conf->set_val_or_die("log_to_stderr", "true");
+  g_conf->set_val_or_die("err_to_stderr", "true");
+  g_conf->apply_changes(NULL);
+
+  map<string,string> profile;
+  if (!load_profile(argv[1], profile))
+    return 1;
+
+  ErasureCodeInterfaceRef ec_impl;
+  {
+    stringstream ss;
+    ceph::ErasureCodePluginRegistry::instance().
+      factory(profile.find("plugin")->second,
+              profile, &ec_impl, ss);
+    if (!ec_impl) {
+      // Report instead of assert()ing, which vanishes under NDEBUG.
+      cerr << "cannot instantiate plugin " << profile["plugin"]
+           << ": " << ss.str() << endl;
+      return 1;
+    }
+  }
+
+  int k = ec_impl->get_data_chunk_count();
+  int m = ec_impl->get_coding_chunk_count();
+
+  // strtol() rather than atoi() so that trailing garbage is rejected.
+  errno = 0;
+  char *end = NULL;
+  long width = strtol(argv[2], &end, 10);
+  if (end == argv[2] || *end != '\0' || errno == ERANGE ||
+      width <= 0 || width > INT_MAX || width % k) {
+    cerr << "stripe must be a positive multiple of " << k << endl;
+    return 1;
+  }
+  int stripe_width = width;
+
+  const int nfiles = 1 + k + m;  // the object plus every shard
+  if (argc != 3 + nfiles) {
+    cerr << "wrong number of filenames given: got " << argc - 3
+         << ", expected " << nfiles << endl;
+    return 1;
+  }
+
+  // Index 0 is the object; index i > 0 is shard i - 1.
+  char **name = argv + 3;
+  vector<bool> named(nfiles, false);    // a real filename was given
+  vector<bool> present(nfiles, false);  // ... and it could be read
+  vector<bufferlist> data(nfiles);
+  int bufsize = 0;     // expected length of every shard file
+  int objsize = 0;     // expected length of the object file
+  int size_file = -1;  // index of the file the lengths were taken from
+  for (int i = 0; i < nfiles; i++) {
+    // Both "--" and "" stand for "no file here", as the usage says.
+    named[i] = name[i] && name[i][0] != '\0' && strcmp(name[i], "--") != 0;
+    if (!named[i])
+      continue;
+
+    ScopedFd fd(::open(name[i], O_RDONLY));
+    if (fd.get() < 0)
+      continue;  // named but unreadable: written out further below
+
+    struct stat st;
+    memset(&st, 0, sizeof(st));
+    if (fstat(fd.get(), &st) != 0) {
+      cerr << name[i] << ": cannot determine size" << endl;
+      return 1;
+    }
+    // Keep len * k below from overflowing int.
+    if (st.st_size > INT_MAX / k) {
+      cerr << name[i] << ": file too large" << endl;
+      return 1;
+    }
+
+    int len = st.st_size;
+    int xobjsize = len * (i ? k : 1);
+    if (size_file == -1) {
+      // The first readable file fixes the sizes all others must match.
+      if (xobjsize % stripe_width) {
+        if (!i)
+          cerr << "object must be padded to a multiple of "
+               << stripe_width << " bytes" << endl;
+        else
+          cerr << "shard must be padded to be a multiple of "
+               << stripe_width / k << " bytes" << endl;
+        return 1;
+      }
+
+      bufsize = xobjsize / k;
+      objsize = xobjsize;
+      size_file = i;
+    } else if ((i ? bufsize : objsize) != len) {
+      cerr << name[i] << " has " << len << " bytes, but based on "
+           << name[size_file] << " we expected "
+           << (i ? bufsize : objsize) << " bytes" << endl;
+      return 1;
+    }
+
+    bufferptr bp(len);
+    if (!read_fully(fd.get(), bp.c_str(), len)) {
+      cerr << name[i] << ": failed to read " << len << " bytes" << endl;
+      return 1;
+    }
+
+    data[i].push_back(bp);
+    present[i] = true;
+  }
+
+  if (size_file == -1) {
+    cerr << "no given file" << endl;
+    return 1;
+  }
+
+  ECUtil::stripe_info_t sinfo(k, stripe_width);
+
+  if (!present[0]) {
+    // No object to encode from: decode whatever is missing out of the
+    // shards that were read.
+    map<int, bufferlist> to_decode;
+    map<int, bufferlist*> out;
+    for (int i = 1; i < nfiles; i++) {
+      if (!named[i])
+        continue;
+
+      if (present[i])
+        to_decode[i - 1] = data[i];
+      else
+        out[i - 1] = &data[i];
+    }
+
+    if (named[0]) {
+      if (ECUtil::decode(sinfo, ec_impl, to_decode, &data[0]) != 0) {
+        cerr << "reconstruction of object failed" << endl;
+        return 1;
+      }
+
+      if (!write_new_file(name[0], data[0])) {
+        cerr << "failed to write to " << name[0] << endl;
+        return 1;
+      }
+    }
+
+    if (ECUtil::decode(sinfo, ec_impl, to_decode, out) != 0) {
+      cerr << "reconstruction of missing shards failed" << endl;
+      return 1;
+    }
+
+    for (int i = 1; i < nfiles; i++) {
+      if (!named[i] || present[i])
+        continue;
+
+      if (!write_new_file(name[i], data[i])) {
+        cerr << "failed to write to " << name[i] << endl;
+        return 1;
+      }
+    }
+  } else {
+    // The object was read: encode every named shard from it, creating
+    // the absent ones and comparing the rest with what was read.
+    set<int> want;
+    map<int, bufferlist> out;
+
+    for (int i = 1; i < nfiles; i++) {
+      if (named[i])
+        want.insert(i - 1);
+    }
+
+    if (ECUtil::encode(sinfo, ec_impl, data[0], want, &out) != 0) {
+      cerr << "encoding failed" << endl;
+      return 1;
+    }
+
+    for (int i = 1; i < nfiles; i++) {
+      if (!named[i])
+        continue;
+
+      if (!present[i]) {
+        if (!write_new_file(name[i], out[i - 1])) {
+          cerr << "failed to write to " << name[i] << endl;
+          return 1;
+        }
+      } else if (!(data[i] == out[i - 1])) {
+        cerr << name[i] << " deviates from the data encoded from "
+             << name[0] << endl;
+        return 1;
+      }
+    }
+  }
+
+  return 0;
+}