@@ -12,9 +12,9 @@ LTCOMMAND = xfs_scrub
INSTALL_SCRUB = install-scrub
endif # scrub_prereqs
-HFILES = scrub.h ../repair/threads.h read_verify.h iocmd.h
+HFILES = scrub.h ../repair/threads.h read_verify.h iocmd.h xfs_ioctl.h
CFILES = ../repair/avl64.c disk.c bitmap.c iocmd.c \
- read_verify.c scrub.c ../repair/threads.c
+ read_verify.c scrub.c ../repair/threads.c xfs.c xfs_ioctl.c
LLDLIBS += $(LIBBLKID) $(LIBXFS) $(LIBXCMD) $(LIBUUID) $(LIBRT) $(LIBPTHREAD) $(LIBHANDLE)
LTDEPENDENCIES += $(LIBXFS) $(LIBXCMD) $(LIBHANDLE)
@@ -638,6 +638,9 @@ _("Must be root to run scrub."));
ctx->nr_io_threads = disk_heads(&ctx->datadev);
else
ctx->nr_io_threads = libxfs_nproc();
+ moveon = xfs_scan_fs(ctx);
+ if (!moveon)
+ goto out;
if (verbose) {
fprintf(stdout, _("%s: using %d threads to scrub.\n"),
ctx->mntpoint, scrub_nproc(ctx));
@@ -664,7 +667,7 @@ _("Errors found, please re-run with -y."));
return true;
}
- return false;
+ return xfs_repair_fs(ctx);
}
/* Run all the phases of the scrubber. */
@@ -676,11 +679,11 @@ run_scrub_phases(
{
struct scrub_phase phases[] = {
{_("Find filesystem geometry."), find_geo},
- {_("Check internal metadata."), NULL},
- {_("Scan all inodes."), NULL},
+ {_("Check internal metadata."), xfs_scan_metadata},
+ {_("Scan all inodes."), xfs_scan_inodes},
{NULL, REPAIR_DUMMY_FN},
{_("Verify data file integrity."), DATASCAN_DUMMY_FN},
- {_("Check summary counters."), NULL},
+ {_("Check summary counters."), xfs_check_summary},
{NULL, NULL},
};
struct phase_info pi;
@@ -698,9 +701,10 @@ run_scrub_phases(
phase->fn = preen;
} else if (ctx->mode == SCRUB_MODE_REPAIR) {
phase->descr = _("Repair filesystem.");
+ phase->fn = xfs_repair_fs;
}
} else if (phase->fn == DATASCAN_DUMMY_FN && scrub_data)
- ;
+ phase->fn = xfs_scan_blocks;
if (phase->fn == REPAIR_DUMMY_FN ||
phase->fn == DATASCAN_DUMMY_FN) {
@@ -906,6 +910,11 @@ _("Only one of the options -n or -y may be specified.\n"));
if (!moveon)
ret |= 4;
+ /* Clean up scan data. */
+ moveon = xfs_cleanup(&ctx);
+ if (!moveon)
+ ret |= 8;
+
if (ctx.repairs && ctx.preens)
fprintf(stdout,
_("%s: %lu repairs and %lu optimizations made.\n"),
@@ -932,6 +941,8 @@ _("%s: %lu errors found. Unmount and run xfs_repair.\n"),
_("%s: %lu warnings found.\n"),
ctx.mntpoint, ctx.warnings_found);
if (ctx.errors_found) {
+ if (error_action == ERRORS_SHUTDOWN)
+ xfs_shutdown_fs(&ctx);
ret |= 1;
}
if (ctx.warnings_found) {
@@ -56,6 +56,22 @@ struct scrub_ctx {
unsigned long warnings_found;
unsigned long repairs;
unsigned long preens;
+
+ /* FS specific stuff */
+ struct xfs_fsop_geom geo;
+ struct fs_path fsinfo;
+ unsigned int agblklog;
+ unsigned int blocklog;
+ unsigned int inodelog;
+ unsigned int inopblog;
+ struct disk logdev;
+ struct disk rtdev;
+ void *fshandle;
+ size_t fshandle_len;
+ unsigned long long capabilities; /* see below */
+ struct read_verify_pool rvp;
+ struct list_head repair_list;
+ bool preen_triggers[XFS_SCRUB_TYPE_MAX + 1];
};
enum errors_action {
@@ -124,4 +140,14 @@ static inline int syncfs(int fd)
}
#endif
+/* FS-specific functions */
+bool xfs_cleanup(struct scrub_ctx *ctx);
+bool xfs_scan_fs(struct scrub_ctx *ctx);
+bool xfs_scan_inodes(struct scrub_ctx *ctx);
+bool xfs_scan_metadata(struct scrub_ctx *ctx);
+bool xfs_check_summary(struct scrub_ctx *ctx);
+bool xfs_scan_blocks(struct scrub_ctx *ctx);
+bool xfs_repair_fs(struct scrub_ctx *ctx);
+void xfs_shutdown_fs(struct scrub_ctx *ctx);
+
#endif /* SCRUB_H_ */
new file mode 100644
@@ -0,0 +1,1517 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "libxfs.h"
+#include <sys/statvfs.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <attr/attributes.h>
+#include "disk.h"
+#include "../repair/threads.h"
+#include "handle.h"
+#include "path.h"
+#include "read_verify.h"
+#include "bitmap.h"
+#include "iocmd.h"
+#include "scrub.h"
+#include "xfs_ioctl.h"
+#include "xfs_fs.h"
+
+/*
+ * XFS Scrubbing Strategy
+ *
+ * The XFS scrubber uses custom XFS ioctls to probe more deeply into the
+ * internals of the filesystem. It takes advantage of scrubbing ioctls
+ * to check all the records stored in a metadata btree and to
+ * cross-reference those records against the other metadata btrees.
+ *
+ * The "find geometry" phase queries XFS for the filesystem geometry.
+ * The block devices for the data, realtime, and log devices are opened.
+ * Kernel ioctls are queried to see if they are implemented, and a data
+ * file read-verify strategy is selected.
+ *
+ * In the "check internal metadata" phase, we call the SCRUB_METADATA
+ * ioctl to check the filesystem's internal per-AG btrees. This
+ * includes the AG superblock, AGF, AGFL, and AGI headers, freespace
+ * btrees, the regular and free inode btrees, the reverse mapping
+ * btrees, and the reference counting btrees. If the realtime device is
+ * enabled, the realtime bitmap and reverse mapping btrees are checked.
+ * Each AG (and the realtime device) has its metadata checked in a
+ * separate thread for better performance.
+ *
+ * The "scan inodes" phase uses BULKSTAT to scan all the inodes in an
+ * AG in disk order. From the BULKSTAT information, a file handle is
+ * constructed and the following items are checked:
+ *
+ * - If it's a symlink, the target is read but not validated.
+ * - Bulkstat data is checked.
+ * - If the inode is a file or a directory, a file descriptor is
+ * opened to pin the inode and for further analysis.
+ * - Extended attribute names and values are read via the file
+ * handle. If this fails and we have a file descriptor open, we
+ * retry with the generic extended attribute APIs.
+ * - If the inode is not a file or directory, we're done.
+ * - Extent maps are scanned to ensure that the records make sense.
+ * We also use the SCRUB_METADATA ioctl for better checking of the
+ * block mapping records.
+ * - If the inode is a directory, open the directory and check that
+ * the dirent type code and inode numbers match the stat output.
+ *
+ * Multiple threads are started to check the inodes of each AG in
+ * parallel.
+ *
+ * In the "verify data file integrity" phase, we employ GETFSMAP to read
+ * the reverse-mappings of all AGs and issue direct-reads of the
+ * underlying disk blocks. We rely on the underlying storage to have
+ * checksummed the data blocks appropriately.
+ *
+ * Multiple threads are started to check each AG in parallel. A
+ * separate thread pool is used to handle the direct reads.
+ *
+ * In the "check summary counters" phase, use GETFSMAP to tally up the
+ * blocks and BULKSTAT to tally up the inodes we saw and compare that to
+ * the statfs output. This gives the user a rough estimate of how
+ * thorough the scrub was.
+ */
+
+/* Routines to scrub an XFS filesystem. */
+
+#define XFS_SCRUB_CAP_PARENT_PTR (1ULL << 0) /* can find parent? */
+
+#define XFS_SCRUB_CAPABILITY_FUNCS(name, flagname) \
+static inline bool \
+xfs_scrub_can_##name(struct scrub_ctx *ctx) \
+{ \
+ return ctx->capabilities & XFS_SCRUB_CAP_##flagname; \
+} \
+static inline void \
+xfs_scrub_set_##name(struct scrub_ctx *ctx) \
+{ \
+ ctx->capabilities |= XFS_SCRUB_CAP_##flagname; \
+} \
+static inline void \
+xfs_scrub_clear_##name(struct scrub_ctx *ctx) \
+{ \
+ ctx->capabilities &= ~(XFS_SCRUB_CAP_##flagname); \
+}
+XFS_SCRUB_CAPABILITY_FUNCS(getparent, PARENT_PTR)
+
+/* Find the fd for a given device identifier. */
+static struct disk *
+xfs_dev_to_disk(
+ struct scrub_ctx *ctx,
+ dev_t dev)
+{
+ if (dev == ctx->fsinfo.fs_datadev)
+ return &ctx->datadev;
+ else if (dev == ctx->fsinfo.fs_logdev)
+ return &ctx->logdev;
+ else if (dev == ctx->fsinfo.fs_rtdev)
+ return &ctx->rtdev;
+ abort();
+}
+
+/* Find the device major/minor for a given file descriptor. */
+static dev_t
+xfs_disk_to_dev(
+ struct scrub_ctx *ctx,
+ struct disk *disk)
+{
+ if (disk == &ctx->datadev)
+ return ctx->fsinfo.fs_datadev;
+ else if (disk == &ctx->logdev)
+ return ctx->fsinfo.fs_logdev;
+ else if (disk == &ctx->rtdev)
+ return ctx->fsinfo.fs_rtdev;
+ abort();
+}
+
+/* Shortcut to creating a read-verify thread pool. */
+static inline bool
+xfs_read_verify_pool_init(
+ struct scrub_ctx *ctx,
+ read_verify_ioend_fn_t ioend_fn)
+{
+ return read_verify_pool_init(&ctx->rvp, ctx, ctx->readbuf,
+ IO_MAX_SIZE, ctx->geo.blocksize, ioend_fn,
+ disk_heads(&ctx->datadev));
+}
+
+struct owner_decode {
+ uint64_t owner;
+ const char *descr;
+};
+
+static const struct owner_decode special_owners[] = {
+ {XFS_FMR_OWN_FREE, "free space"},
+ {XFS_FMR_OWN_UNKNOWN, "unknown owner"},
+ {XFS_FMR_OWN_FS, "static FS metadata"},
+ {XFS_FMR_OWN_LOG, "journalling log"},
+ {XFS_FMR_OWN_AG, "per-AG metadata"},
+ {XFS_FMR_OWN_INOBT, "inode btree blocks"},
+ {XFS_FMR_OWN_INODES, "inodes"},
+ {XFS_FMR_OWN_REFC, "refcount btree"},
+ {XFS_FMR_OWN_COW, "CoW staging"},
+ {XFS_FMR_OWN_DEFECTIVE, "bad blocks"},
+ {0, NULL},
+};
+
+/* Decode a special owner. */
+static const char *
+xfs_decode_special_owner(
+ uint64_t owner)
+{
+ const struct owner_decode *od = special_owners;
+
+ while (od->descr) {
+ if (od->owner == owner)
+ return od->descr;
+ od++;
+ }
+
+ return NULL;
+}
+
+/* BULKSTAT wrapper routines. */
+struct xfs_scan_inodes {
+ xfs_inode_iter_fn fn;
+ void *arg;
+ size_t array_arg_size;
+ bool moveon;
+};
+
+/* Scan all the inodes in an AG. */
+static void
+xfs_scan_ag_inodes(
+ struct work_queue *wq,
+ xfs_agnumber_t agno,
+ void *arg)
+{
+ struct xfs_scan_inodes *si = arg;
+ struct scrub_ctx *ctx = (struct scrub_ctx *)wq->mp;
+ void *fn_arg;
+ char descr[DESCR_BUFSZ];
+ uint64_t ag_ino;
+ uint64_t next_ag_ino;
+ bool moveon;
+
+ snprintf(descr, DESCR_BUFSZ, _("dev %d:%d AG %u inodes"),
+ major(ctx->fsinfo.fs_datadev),
+ minor(ctx->fsinfo.fs_datadev),
+ agno);
+
+ ag_ino = (__u64)agno << (ctx->inopblog + ctx->agblklog);
+ next_ag_ino = (__u64)(agno + 1) << (ctx->inopblog + ctx->agblklog);
+
+ fn_arg = ((char *)si->arg) + si->array_arg_size * agno;
+ moveon = xfs_iterate_inodes(ctx, descr, ctx->fshandle, ag_ino,
+ next_ag_ino - 1, si->fn, fn_arg);
+ if (!moveon)
+ si->moveon = false;
+}
+
+/* How many array elements should we create to scan all the inodes? */
+static inline size_t
+xfs_scan_all_inodes_array_size(
+ struct scrub_ctx *ctx)
+{
+ return ctx->geo.agcount;
+}
+
+/* Scan all the inodes in a filesystem. */
+static bool
+xfs_scan_all_inodes_array_arg(
+ struct scrub_ctx *ctx,
+ xfs_inode_iter_fn fn,
+ void *arg,
+ size_t array_arg_size)
+{
+ struct xfs_scan_inodes si;
+ xfs_agnumber_t agno;
+ struct work_queue wq;
+
+ si.moveon = true;
+ si.fn = fn;
+ si.arg = arg;
+ si.array_arg_size = array_arg_size;
+
+ create_work_queue(&wq, (struct xfs_mount *)ctx, scrub_nproc(ctx));
+ for (agno = 0; agno < ctx->geo.agcount; agno++)
+ queue_work(&wq, xfs_scan_ag_inodes, agno, &si);
+ destroy_work_queue(&wq);
+
+ return si.moveon;
+}
+#define xfs_scan_all_inodes(ctx, fn) \
+ xfs_scan_all_inodes_array_arg((ctx), (fn), NULL, 0)
+#define xfs_scan_all_inodes_arg(ctx, fn, arg) \
+ xfs_scan_all_inodes_array_arg((ctx), (fn), (arg), 0)
+
+/* GETFSMAP wrapper routines. */
+struct xfs_scan_blocks {
+ xfs_fsmap_iter_fn fn;
+ void *arg;
+ size_t array_arg_size;
+ bool moveon;
+};
+
+/* Iterate all the reverse mappings of an AG. */
+static void
+xfs_scan_ag_blocks(
+ struct work_queue *wq,
+ xfs_agnumber_t agno,
+ void *arg)
+{
+ struct scrub_ctx *ctx = (struct scrub_ctx *)wq->mp;
+ struct xfs_scan_blocks *sbx = arg;
+ void *fn_arg;
+ char descr[DESCR_BUFSZ];
+ struct fsmap keys[2];
+ off64_t bperag;
+ bool moveon;
+
+ bperag = (off64_t)ctx->geo.agblocks *
+ (off64_t)ctx->geo.blocksize;
+
+ snprintf(descr, DESCR_BUFSZ, _("dev %d:%d AG %u fsmap"),
+ major(ctx->fsinfo.fs_datadev),
+ minor(ctx->fsinfo.fs_datadev),
+ agno);
+
+ memset(keys, 0, sizeof(struct fsmap) * 2);
+ keys->fmr_device = ctx->fsinfo.fs_datadev;
+ keys->fmr_physical = agno * bperag;
+ (keys + 1)->fmr_device = ctx->fsinfo.fs_datadev;
+ (keys + 1)->fmr_physical = ((agno + 1) * bperag) - 1;
+ (keys + 1)->fmr_owner = ULLONG_MAX;
+ (keys + 1)->fmr_offset = ULLONG_MAX;
+ (keys + 1)->fmr_flags = UINT_MAX;
+
+ fn_arg = ((char *)sbx->arg) + sbx->array_arg_size * agno;
+ moveon = xfs_iterate_fsmap(ctx, descr, keys, sbx->fn, fn_arg);
+ if (!moveon)
+ sbx->moveon = false;
+}
+
+/* Iterate all the reverse mappings of a standalone device. */
+static void
+xfs_scan_dev_blocks(
+ struct scrub_ctx *ctx,
+ int idx,
+ dev_t dev,
+ struct xfs_scan_blocks *sbx)
+{
+ struct fsmap keys[2];
+ char descr[DESCR_BUFSZ];
+ void *fn_arg;
+ bool moveon;
+
+ snprintf(descr, DESCR_BUFSZ, _("dev %d:%d fsmap"),
+ major(dev), minor(dev));
+
+ memset(keys, 0, sizeof(struct fsmap) * 2);
+ keys->fmr_device = dev;
+ (keys + 1)->fmr_device = dev;
+ (keys + 1)->fmr_physical = ULLONG_MAX;
+ (keys + 1)->fmr_owner = ULLONG_MAX;
+ (keys + 1)->fmr_offset = ULLONG_MAX;
+ (keys + 1)->fmr_flags = UINT_MAX;
+
+ fn_arg = ((char *)sbx->arg) + sbx->array_arg_size * idx;
+ moveon = xfs_iterate_fsmap(ctx, descr, keys, sbx->fn, fn_arg);
+ if (!moveon)
+ sbx->moveon = false;
+}
+
+/* Iterate all the reverse mappings of the realtime device. */
+static void
+xfs_scan_rt_blocks(
+ struct work_queue *wq,
+ xfs_agnumber_t agno,
+ void *arg)
+{
+ struct scrub_ctx *ctx = (struct scrub_ctx *)wq->mp;
+
+ xfs_scan_dev_blocks(ctx, agno, ctx->fsinfo.fs_rtdev, arg);
+}
+
+/* Iterate all the reverse mappings of the log device. */
+static void
+xfs_scan_log_blocks(
+ struct work_queue *wq,
+ xfs_agnumber_t agno,
+ void *arg)
+{
+ struct scrub_ctx *ctx = (struct scrub_ctx *)wq->mp;
+
+ xfs_scan_dev_blocks(ctx, agno, ctx->fsinfo.fs_logdev, arg);
+}
+
+/* How many array elements should we create to scan all the blocks? */
+static size_t
+xfs_scan_all_blocks_array_size(
+	struct scrub_ctx *ctx)
+{
+	return ctx->geo.agcount + 3; /* AGs, rt @ agcount+1, log @ agcount+2 */
+}
+
+/* Scan all the blocks in a filesystem. */
+static bool
+xfs_scan_all_blocks_array_arg(
+ struct scrub_ctx *ctx,
+ xfs_fsmap_iter_fn fn,
+ void *arg,
+ size_t array_arg_size)
+{
+ xfs_agnumber_t agno;
+ struct work_queue wq;
+ struct xfs_scan_blocks sbx;
+
+ sbx.moveon = true;
+ sbx.fn = fn;
+ sbx.arg = arg;
+ sbx.array_arg_size = array_arg_size;
+
+ create_work_queue(&wq, (struct xfs_mount *)ctx, scrub_nproc(ctx));
+ if (ctx->fsinfo.fs_rt)
+ queue_work(&wq, xfs_scan_rt_blocks, ctx->geo.agcount + 1,
+ &sbx);
+ if (ctx->fsinfo.fs_log)
+ queue_work(&wq, xfs_scan_log_blocks, ctx->geo.agcount + 2,
+ &sbx);
+ for (agno = 0; agno < ctx->geo.agcount; agno++)
+ queue_work(&wq, xfs_scan_ag_blocks, agno, &sbx);
+ destroy_work_queue(&wq);
+
+ return sbx.moveon;
+}
+
+/* Routines to translate bad physical extents into file paths and offsets. */
+
+struct xfs_verify_error_info {
+ struct bitmap *d_bad; /* bytes */
+ struct bitmap *r_bad; /* bytes */
+};
+
+/* Report if this extent overlaps a bad region. */
+static bool
+xfs_report_verify_inode_bmap(
+ struct scrub_ctx *ctx,
+ const char *descr,
+ int fd,
+ int whichfork,
+ struct fsxattr *fsx,
+ struct xfs_bmap *bmap,
+ void *arg)
+{
+ struct xfs_verify_error_info *vei = arg;
+ struct bitmap *tree;
+
+ /* Only report errors for real extents. */
+ if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC))
+ return true;
+
+ if (fsx->fsx_xflags & FS_XFLAG_REALTIME)
+ tree = vei->r_bad;
+ else
+ tree = vei->d_bad;
+
+ if (!bitmap_has_extent(tree, bmap->bm_physical, bmap->bm_length))
+ return true;
+
+ str_error(ctx, descr,
+_("offset %llu failed read verification."), bmap->bm_offset);
+ return true;
+}
+
+/* Iterate the extent mappings of a file to report errors. */
+static bool
+xfs_report_verify_fd(
+ struct scrub_ctx *ctx,
+ const char *descr,
+ int fd,
+ void *arg)
+{
+ struct xfs_bmap key = {0};
+ bool moveon;
+
+ /* data fork */
+ moveon = xfs_iterate_bmap(ctx, descr, fd, XFS_DATA_FORK, &key,
+ xfs_report_verify_inode_bmap, arg);
+ if (!moveon)
+ return false;
+
+ /* attr fork */
+ moveon = xfs_iterate_bmap(ctx, descr, fd, XFS_ATTR_FORK, &key,
+ xfs_report_verify_inode_bmap, arg);
+ if (!moveon)
+ return false;
+ return true;
+}
+
+/* Report read verify errors in unlinked (but still open) files. */
+static int
+xfs_report_verify_inode(
+ struct scrub_ctx *ctx,
+ struct xfs_handle *handle,
+ struct xfs_bstat *bstat,
+ void *arg)
+{
+ char descr[DESCR_BUFSZ];
+ char buf[DESCR_BUFSZ];
+ bool moveon;
+ int fd;
+ int error;
+
+ snprintf(descr, DESCR_BUFSZ, _("inode %llu (unlinked)"), bstat->bs_ino);
+
+ /* Ignore linked files and things we can't open. */
+ if (bstat->bs_nlink != 0)
+ return 0;
+ if (!S_ISREG(bstat->bs_mode) && !S_ISDIR(bstat->bs_mode))
+ return 0;
+
+ /* Try to open the inode. */
+ fd = open_by_fshandle(handle, sizeof(*handle),
+ O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY);
+ if (fd < 0) {
+ error = errno;
+ if (error == ESTALE)
+ return error;
+
+ str_warn(ctx, descr, "%s", strerror_r(error, buf, DESCR_BUFSZ));
+ return error;
+ }
+
+ /* Go find the badness. */
+ moveon = xfs_report_verify_fd(ctx, descr, fd, arg);
+ close(fd);
+
+ return moveon ? 0 : XFS_ITERATE_INODES_ABORT;
+}
+
+/* Scan the inode associated with a directory entry. */
+static bool
+xfs_report_verify_dirent(
+	struct scrub_ctx *ctx,
+	const char *path,
+	int dir_fd,
+	struct dirent *dirent,
+	struct stat *sb,
+	void *arg)
+{
+	bool moveon;
+	int fd;
+
+	/* Ignore things we can't open. */
+	if (!S_ISREG(sb->st_mode) && !S_ISDIR(sb->st_mode))
+		return true;
+	/* Ignore . and .. */
+	if (dirent && (!strcmp(".", dirent->d_name) ||
+		       !strcmp("..", dirent->d_name)))
+		return true;
+
+	/* Open the file */
+	fd = dirent_open(dir_fd, dirent);
+	if (fd < 0)
+		return true;
+
+	/*
+	 * Go find the badness.  The previous "if (moveon) goto out;"
+	 * jumped to a label on the very next statement, so it was a
+	 * no-op; close the fd and report the scan result
+	 * unconditionally.
+	 */
+	moveon = xfs_report_verify_fd(ctx, path, fd, arg);
+	close(fd);
+
+	return moveon;
+}
+
+/* Given bad extent lists for the data & rtdev, find bad files. */
+static bool
+xfs_report_verify_errors(
+ struct scrub_ctx *ctx,
+ struct bitmap *d_bad,
+ struct bitmap *r_bad)
+{
+ struct xfs_verify_error_info vei;
+ bool moveon;
+
+ vei.d_bad = d_bad;
+ vei.r_bad = r_bad;
+
+ /* Scan the directory tree to get file paths. */
+ moveon = scan_fs_tree(ctx, NULL, xfs_report_verify_dirent, &vei);
+ if (!moveon)
+ return false;
+
+ /* Scan for unlinked files. */
+ return xfs_scan_all_inodes_arg(ctx, xfs_report_verify_inode, &vei);
+}
+
+/* Phase 1: Find filesystem geometry */
+
+/* Clean up the XFS-specific state data. */
+bool
+xfs_cleanup(
+ struct scrub_ctx *ctx)
+{
+ if (ctx->fshandle)
+ free_handle(ctx->fshandle, ctx->fshandle_len);
+ disk_close(&ctx->rtdev);
+ disk_close(&ctx->logdev);
+ disk_close(&ctx->datadev);
+
+ return true;
+}
+
+/* Read the XFS geometry. */
+bool
+xfs_scan_fs(
+	struct scrub_ctx *ctx)
+{
+	struct fs_path *fsp;
+	int error;
+
+	if (!platform_test_xfs_fd(ctx->mnt_fd)) {
+		str_error(ctx, ctx->mntpoint,
+_("Does not appear to be an XFS filesystem!"));
+		return false;
+	}
+
+	/*
+	 * Flush everything out to disk before we start checking.
+	 * This seems to reduce the incidence of stale file handle
+	 * errors when we open things by handle.
+	 */
+	error = syncfs(ctx->mnt_fd);
+	if (error) {
+		str_errno(ctx, ctx->mntpoint);
+		return false;
+	}
+
+	INIT_LIST_HEAD(&ctx->repair_list);
+	ctx->datadev.d_fd = ctx->logdev.d_fd = ctx->rtdev.d_fd = -1;
+
+	/* Retrieve XFS geometry. */
+	error = ioctl(ctx->mnt_fd, XFS_IOC_FSGEOMETRY, &ctx->geo);
+	if (error) {
+		str_errno(ctx, ctx->mntpoint);
+		goto err;
+	}
+
+	ctx->agblklog = libxfs_log2_roundup(ctx->geo.agblocks);
+	ctx->blocklog = libxfs_highbit32(ctx->geo.blocksize);
+	ctx->inodelog = libxfs_highbit32(ctx->geo.inodesize);
+	ctx->inopblog = ctx->blocklog - ctx->inodelog;
+
+	error = path_to_fshandle(ctx->mntpoint, &ctx->fshandle,
+			&ctx->fshandle_len);
+	if (error) {
+		perror(_("getting fshandle"));
+		goto err;
+	}
+
+	/* Do we have bulkstat? */
+	if (!xfs_can_iterate_inodes(ctx)) {
+		str_info(ctx, ctx->mntpoint, _("BULKSTAT is required."));
+		goto err;
+	}
+
+	/* Do we have getbmapx? */
+	if (!xfs_can_iterate_bmap(ctx)) {
+		str_info(ctx, ctx->mntpoint, _("GETBMAPX is required."));
+		goto err;
+	}
+
+	/* Do we have getfsmap? */
+	if (!xfs_can_iterate_fsmap(ctx)) {
+		str_info(ctx, ctx->mntpoint, _("GETFSMAP is required."));
+		goto err;
+	}
+
+	/* Do we have kernel-assisted metadata scrubbing? */
+	if (!xfs_can_scrub_fs_metadata(ctx) || !xfs_can_scrub_inode(ctx) ||
+	    !xfs_can_scrub_bmap(ctx) || !xfs_can_scrub_dir(ctx) ||
+	    !xfs_can_scrub_attr(ctx) || !xfs_can_scrub_symlink(ctx)) {
+		str_info(ctx, ctx->mntpoint,
+_("kernel metadata scrub is required."));
+		goto err;
+	}
+
+	/* Go find the XFS devices if we have a usable fsmap. */
+	fs_table_initialise(0, NULL, 0, NULL);
+	errno = 0;
+	fsp = fs_table_lookup(ctx->mntpoint, FS_MOUNT_POINT);
+	if (!fsp) {
+		str_error(ctx, ctx->mntpoint,
+_("Unable to find XFS information."));
+		goto err;
+	}
+	memcpy(&ctx->fsinfo, fsp, sizeof(struct fs_path));
+
+	/* Did we find the log and rt devices, if they're present? */
+	if (ctx->geo.logstart == 0 && ctx->fsinfo.fs_log == NULL) {
+		str_error(ctx, ctx->mntpoint,
+_("Unable to find log device path."));
+		goto err;
+	}
+	if (ctx->geo.rtblocks && ctx->fsinfo.fs_rt == NULL) {
+		str_error(ctx, ctx->mntpoint,
+_("Unable to find realtime device path."));
+		goto err;
+	}
+
+	/* Open the raw devices. */
+	error = disk_open(ctx->fsinfo.fs_name, &ctx->datadev);
+	if (error) {
+		str_errno(ctx, ctx->fsinfo.fs_name);
+		goto err;
+	}
+	ctx->nr_io_threads = libxfs_nproc();
+
+	if (ctx->fsinfo.fs_log) {
+		error = disk_open(ctx->fsinfo.fs_log, &ctx->logdev);
+		if (error) {
+			str_errno(ctx, ctx->fsinfo.fs_log);
+			goto err;
+		}
+	}
+	if (ctx->fsinfo.fs_rt) {
+		error = disk_open(ctx->fsinfo.fs_rt, &ctx->rtdev);
+		if (error) {
+			str_errno(ctx, ctx->fsinfo.fs_rt);
+			goto err;
+		}
+	}
+
+	return true;
+err:
+	return false;
+}
+
+/* Phase 2: Check internal metadata. */
+
+/* Defer all the repairs until phase 4. */
+static void
+xfs_defer_repairs(
+ struct scrub_ctx *ctx,
+ struct list_head *repairs)
+{
+ if (list_empty(repairs))
+ return;
+
+ pthread_mutex_lock(&ctx->lock);
+ list_splice_tail_init(repairs, &ctx->repair_list);
+ pthread_mutex_unlock(&ctx->lock);
+}
+
+/* Repair some AG metadata; broken things are remembered for later. */
+static bool
+xfs_quick_repair(
+ struct scrub_ctx *ctx,
+ struct list_head *repairs)
+{
+ bool moveon;
+
+ moveon = xfs_repair_metadata_list(ctx, ctx->mnt_fd, repairs,
+ XRML_REPAIR_ONLY);
+ if (!moveon)
+ return moveon;
+
+ xfs_defer_repairs(ctx, repairs);
+ return true;
+}
+
+/* Scrub each AG's metadata btrees. */
+static void
+xfs_scan_ag_metadata(
+ struct work_queue *wq,
+ xfs_agnumber_t agno,
+ void *arg)
+{
+ struct scrub_ctx *ctx = (struct scrub_ctx *)wq->mp;
+ bool *pmoveon = arg;
+ struct repair_item *n;
+ struct repair_item *ri;
+ struct list_head repairs;
+ struct list_head repair_now;
+ unsigned int broken_primaries;
+ unsigned int broken_secondaries;
+ bool moveon;
+ char descr[DESCR_BUFSZ];
+
+ INIT_LIST_HEAD(&repairs);
+ INIT_LIST_HEAD(&repair_now);
+ snprintf(descr, DESCR_BUFSZ, _("AG %u"), agno);
+
+ /*
+ * First we scrub and fix the AG headers, because we need
+ * them to work well enough to check the AG btrees.
+ */
+ moveon = xfs_scrub_ag_headers(ctx, agno, &repairs);
+ if (!moveon)
+ goto err;
+
+ /* Repair header damage. */
+ moveon = xfs_quick_repair(ctx, &repairs);
+ if (!moveon)
+ goto err;
+
+ /* Now scrub the AG btrees. */
+ moveon = xfs_scrub_ag_metadata(ctx, agno, &repairs);
+ if (!moveon)
+ goto err;
+
+ /*
+ * Figure out if we need to perform early fixing. The only
+ * reason we need to do this is if the inobt is broken, which
+ * prevents phase 3 (inode scan) from running. We can rebuild
+ * the inobt from rmapbt data, but if the rmapbt is broken even
+ * at this early phase then we are sunk.
+ */
+ broken_secondaries = 0;
+ broken_primaries = 0;
+ list_for_each_entry_safe(ri, n, &repairs, list) {
+ switch (ri->op.sm_type) {
+ case XFS_SCRUB_TYPE_RMAPBT:
+ broken_secondaries++;
+ break;
+ case XFS_SCRUB_TYPE_FINOBT:
+ case XFS_SCRUB_TYPE_INOBT:
+ list_del(&ri->list);
+ list_add_tail(&ri->list, &repair_now);
+ /* fall through */
+ case XFS_SCRUB_TYPE_BNOBT:
+ case XFS_SCRUB_TYPE_CNTBT:
+ case XFS_SCRUB_TYPE_REFCNTBT:
+ broken_primaries++;
+ break;
+ default:
+ ASSERT(false);
+ break;
+ }
+ }
+ if (broken_secondaries && !debug_tweak_on("XFS_SCRUB_FORCE_REPAIR")) {
+ if (broken_primaries)
+ str_warn(ctx, descr,
+_("Corrupt primary and secondary block mapping metadata."));
+ else
+ str_warn(ctx, descr,
+_("Corrupt secondary block mapping metadata."));
+ str_warn(ctx, descr,
+_("Filesystem might not be repairable."));
+ }
+
+ /* Repair (inode) btree damage. */
+ moveon = xfs_quick_repair(ctx, &repair_now);
+ if (!moveon)
+ goto err;
+
+ /* Everything else gets fixed during phase 4. */
+ xfs_defer_repairs(ctx, &repairs);
+
+ return;
+err:
+ *pmoveon = false;
+ return;
+}
+
+/* Scrub whole-FS metadata btrees. */
+static void
+xfs_scan_fs_metadata(
+ struct work_queue *wq,
+ xfs_agnumber_t agno,
+ void *arg)
+{
+ struct scrub_ctx *ctx = (struct scrub_ctx *)wq->mp;
+ bool *pmoveon = arg;
+ struct list_head repairs;
+ bool moveon;
+
+ INIT_LIST_HEAD(&repairs);
+ moveon = xfs_scrub_fs_metadata(ctx, &repairs);
+ if (!moveon)
+ *pmoveon = false;
+
+ pthread_mutex_lock(&ctx->lock);
+ list_splice_tail_init(&repairs, &ctx->repair_list);
+ pthread_mutex_unlock(&ctx->lock);
+}
+
+/* Try to scan metadata via sysfs. */
+bool
+xfs_scan_metadata(
+ struct scrub_ctx *ctx)
+{
+ xfs_agnumber_t agno;
+ struct work_queue wq;
+ bool moveon = true;
+
+ create_work_queue(&wq, (struct xfs_mount *)ctx, scrub_nproc(ctx));
+ queue_work(&wq, xfs_scan_fs_metadata, 0, &moveon);
+ for (agno = 0; agno < ctx->geo.agcount; agno++)
+ queue_work(&wq, xfs_scan_ag_metadata, agno, &moveon);
+ destroy_work_queue(&wq);
+
+ return moveon;
+}
+
+/* Phase 3: Scan all inodes. */
+
+/*
+ * Scrub part of a file.  If the user passes in a valid fd we assume
+ * that's the file to check; otherwise, pass in the inode number and
+ * let the kernel sort it out.
+ */
+static bool
+xfs_scrub_fd(
+	struct scrub_ctx *ctx,
+	bool (*fn)(struct scrub_ctx *, uint64_t,
+		uint32_t, int, struct list_head *),
+	struct xfs_bstat *bs,
+	int fd,
+	struct list_head *repairs)
+{
+	if (fd < 0)
+		fd = ctx->mnt_fd;
+	return fn(ctx, bs->bs_ino, bs->bs_gen, fd, repairs);
+}
+
+/* Verify the contents, xattrs, and extent maps of an inode. */
+static int
+xfs_scrub_inode(
+ struct scrub_ctx *ctx,
+ struct xfs_handle *handle,
+ struct xfs_bstat *bstat,
+ void *arg)
+{
+ struct list_head repairs;
+ char descr[DESCR_BUFSZ];
+ bool moveon = true;
+ int fd = -1;
+ int error = 0;
+
+ INIT_LIST_HEAD(&repairs);
+ snprintf(descr, DESCR_BUFSZ, _("inode %llu"), bstat->bs_ino);
+
+ /* Try to open the inode to pin it. */
+ if (S_ISREG(bstat->bs_mode) || S_ISDIR(bstat->bs_mode)) {
+ fd = open_by_fshandle(handle, sizeof(*handle),
+ O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY);
+ if (fd < 0) {
+ error = errno;
+ if (error != ESTALE)
+ str_errno(ctx, descr);
+ goto out;
+ }
+ }
+
+ /* Scrub the inode. */
+ moveon = xfs_scrub_fd(ctx, xfs_scrub_inode_fields, bstat, fd,
+ &repairs);
+ if (!moveon)
+ goto out;
+
+ moveon = xfs_quick_repair(ctx, &repairs);
+ if (!moveon)
+ goto out;
+
+ /* Scrub all block mappings. */
+ moveon = xfs_scrub_fd(ctx, xfs_scrub_data_fork, bstat, fd,
+ &repairs);
+ if (!moveon)
+ goto out;
+ moveon = xfs_scrub_fd(ctx, xfs_scrub_attr_fork, bstat, fd,
+ &repairs);
+ if (!moveon)
+ goto out;
+ moveon = xfs_scrub_fd(ctx, xfs_scrub_cow_fork, bstat, fd,
+ &repairs);
+ if (!moveon)
+ goto out;
+
+ moveon = xfs_quick_repair(ctx, &repairs);
+ if (!moveon)
+ goto out;
+
+ /* XXX: Some day, check child -> parent dir -> child. */
+
+ if (S_ISLNK(bstat->bs_mode)) {
+ /* Check symlink contents. */
+ moveon = xfs_scrub_symlink(ctx, bstat->bs_ino,
+ bstat->bs_gen, ctx->mnt_fd, &repairs);
+ } else if (S_ISDIR(bstat->bs_mode)) {
+ /* Check the directory entries. */
+ moveon = xfs_scrub_fd(ctx, xfs_scrub_dir, bstat, fd, &repairs);
+ }
+ if (!moveon)
+ goto out;
+
+ /*
+ * Read all the extended attributes. If any of the read
+ * functions decline to move on, we can try again with the
+ * VFS functions if we have a file descriptor.
+ */
+ moveon = xfs_scrub_fd(ctx, xfs_scrub_attr, bstat, fd, &repairs);
+ if (!moveon)
+ goto out;
+
+ moveon = xfs_quick_repair(ctx, &repairs);
+
+out:
+ xfs_defer_repairs(ctx, &repairs);
+ if (fd >= 0)
+ close(fd);
+ if (error)
+ return error;
+ return moveon ? 0 : XFS_ITERATE_INODES_ABORT;
+}
+
+/* Verify all the inodes in a filesystem. */
+bool
+xfs_scan_inodes(
+ struct scrub_ctx *ctx)
+{
+ if (!xfs_scan_all_inodes(ctx, xfs_scrub_inode))
+ return false;
+ xfs_scrub_report_preen_triggers(ctx);
+ return true;
+}
+
+/* Phase 4: Repair filesystem. */
+
+static int
+list_length(
+ struct list_head *head)
+{
+ struct list_head *pos;
+ int nr = 0;
+
+ list_for_each(pos, head) {
+ nr++;
+ }
+
+ return nr;
+}
+
+/* Fix the per-AG and per-FS metadata. */
+bool
+xfs_repair_fs(
+ struct scrub_ctx *ctx)
+{
+ int len;
+ int old_len;
+ bool moveon;
+
+ /* Repair anything broken until we fail to make progress. */
+ len = list_length(&ctx->repair_list);
+ do {
+ old_len = len;
+ moveon = xfs_repair_metadata_list(ctx, ctx->mnt_fd,
+ &ctx->repair_list, 0);
+ if (!moveon)
+ return false;
+ len = list_length(&ctx->repair_list);
+ } while (old_len > len);
+
+ /* Try once more, but this time complain if we can't fix things. */
+ moveon = xfs_repair_metadata_list(ctx, ctx->mnt_fd,
+ &ctx->repair_list, XRML_NOFIX_COMPLAIN);
+ if (!moveon)
+ return false;
+
+ fstrim(ctx);
+ return true;
+}
+
+/* Phase 5: Verify data file integrity. */
+
+/* Verify disk blocks with GETFSMAP */
+
+struct xfs_verify_extent {
+ /* Maintain state for the lazy read verifier. */
+ struct read_verify rv;
+
+ /* Store bad extents if we don't have parent pointers. */
+ struct bitmap *d_bad; /* bytes */
+ struct bitmap *r_bad; /* bytes */
+
+ /* Track the last extent we saw. */
+ uint64_t laststart; /* bytes */
+ uint64_t lastlength; /* bytes */
+	bool lastshared; /* was the last extent shared? */
+};
+
+/* Report an IO error resulting from read-verify based off getfsmap. */
+static bool
+xfs_check_rmap_error_report(
+ struct scrub_ctx *ctx,
+ const char *descr,
+ struct fsmap *map,
+ void *arg)
+{
+ const char *type;
+ char buf[32];
+ uint64_t err_physical = *(uint64_t *)arg;
+ uint64_t err_off;
+
+ if (err_physical > map->fmr_physical)
+ err_off = err_physical - map->fmr_physical;
+ else
+ err_off = 0;
+
+ snprintf(buf, 32, _("disk offset %llu"),
+ BTOBB(map->fmr_physical + err_off));
+
+ if (map->fmr_flags & FMR_OF_SPECIAL_OWNER) {
+ type = xfs_decode_special_owner(map->fmr_owner);
+ str_error(ctx, buf,
+_("%s failed read verification."),
+ type);
+ } else if (xfs_scrub_can_getparent(ctx)) {
+ /* XXX: go find the parent path */
+ str_error(ctx, buf,
+_("XXX: inode %lld offset %llu failed read verification."),
+ map->fmr_owner, map->fmr_offset + err_off);
+ }
+ return true;
+}
+
+/*
+ * Handle a read error in the rmap-based read verify.  This is the
+ * read_verify_pool ioerr callback: record the bad extent (if we can't
+ * resolve parents) and then re-query GETFSMAP to report every owner
+ * overlapping the failed range.
+ */
+void
+xfs_check_rmap_ioerr(
+	struct read_verify_pool	*rvp,
+	struct disk		*disk,
+	uint64_t		start,
+	uint64_t		length,
+	int			error,
+	void			*arg)
+{
+	struct fsmap		keys[2];
+	char			descr[DESCR_BUFSZ];
+	struct scrub_ctx	*ctx = rvp->rvp_ctx;
+	struct xfs_verify_extent	*ve;
+	struct bitmap		*tree;
+	dev_t			dev;
+	bool			moveon;
+
+	ve = arg;
+	dev = xfs_disk_to_dev(ctx, disk);
+
+	/*
+	 * If we don't have parent pointers, save the bad extent for
+	 * later rescanning.
+	 */
+	if (!xfs_scrub_can_getparent(ctx)) {
+		if (dev == ctx->fsinfo.fs_datadev)
+			tree = ve->d_bad;
+		else if (dev == ctx->fsinfo.fs_rtdev)
+			tree = ve->r_bad;
+		else
+			tree = NULL;	/* log dev errors aren't tracked here */
+		if (tree) {
+			moveon = bitmap_add(tree, start, length);
+			if (!moveon)
+				str_errno(ctx, ctx->mntpoint);
+		}
+	}
+
+	snprintf(descr, DESCR_BUFSZ, _("dev %d:%d ioerr @ %"PRIu64":%"PRIu64" "),
+			major(dev), minor(dev), start, length);
+
+	/* Go figure out which blocks are bad from the fsmap. */
+	memset(keys, 0, sizeof(struct fsmap) * 2);
+	keys->fmr_device = dev;
+	keys->fmr_physical = start;
+	/* High key: saturate the owner/offset/flags to span the whole range. */
+	(keys + 1)->fmr_device = dev;
+	(keys + 1)->fmr_physical = start + length - 1;
+	(keys + 1)->fmr_owner = ULLONG_MAX;
+	(keys + 1)->fmr_offset = ULLONG_MAX;
+	(keys + 1)->fmr_flags = UINT_MAX;
+	xfs_iterate_fsmap(ctx, descr, keys, xfs_check_rmap_error_report,
+			&start);
+}
+
+/*
+ * Read verify a (data block) extent.  GETFSMAP iterator callback:
+ * filter out extents that don't contain written file data, then hand
+ * the rest to the lazy read verifier.  Returning true continues the
+ * fsmap iteration.
+ */
+static bool
+xfs_check_rmap(
+	struct scrub_ctx	*ctx,
+	const char		*descr,
+	struct fsmap		*map,
+	void			*arg)
+{
+	struct xfs_verify_extent	*ve = arg;
+	struct disk		*disk;
+
+	dbg_printf("rmap dev %d:%d phys %llu owner %lld offset %llu "
+			"len %llu flags 0x%x\n", major(map->fmr_device),
+			minor(map->fmr_device), map->fmr_physical,
+			map->fmr_owner, map->fmr_offset,
+			map->fmr_length, map->fmr_flags);
+
+	/* Remember this extent. */
+	ve->lastshared = (map->fmr_flags & FMR_OF_SHARED);
+	ve->laststart = map->fmr_physical;
+	ve->lastlength = map->fmr_length;
+
+	/* "Unknown" extents should be verified; they could be data. */
+	if ((map->fmr_flags & FMR_OF_SPECIAL_OWNER) &&
+	    map->fmr_owner == XFS_FMR_OWN_UNKNOWN)
+		map->fmr_flags &= ~FMR_OF_SPECIAL_OWNER;
+
+	/*
+	 * We only care about read-verifying data extents that have been
+	 * written to disk.  This means we can skip "special" owners
+	 * (metadata), xattr blocks, unwritten extents, and extent maps.
+	 * These should all get checked elsewhere in the scrubber.
+	 */
+	if (map->fmr_flags & (FMR_OF_PREALLOC | FMR_OF_ATTR_FORK |
+			      FMR_OF_EXTENT_MAP | FMR_OF_SPECIAL_OWNER))
+		goto out;
+
+	/* XXX: Filter out directory data blocks. */
+
+	/* Schedule the read verify command for (eventual) running. */
+	disk = xfs_dev_to_disk(ctx, map->fmr_device);
+
+	/* Batches reads lazily; actual IO may happen much later. */
+	read_verify_schedule(&ctx->rvp, &ve->rv, disk, map->fmr_physical,
+			map->fmr_length, ve);
+
+out:
+	/* Is this the last extent?  Fire off the read. */
+	if (map->fmr_flags & FMR_OF_LAST)
+		read_verify_force(&ctx->rvp, &ve->rv);
+
+	return true;
+}
+
+/*
+ * Verify all the blocks in a filesystem.  Sets up one verify-extent
+ * context per scan group, walks the fsmap in parallel scheduling read
+ * verifies, then (without parent pointers) rescans the directory tree
+ * to attribute any bad extents to file paths.
+ */
+bool
+xfs_scan_blocks(
+	struct scrub_ctx		*ctx)
+{
+	struct bitmap			d_bad;	/* shared across all groups */
+	struct bitmap			r_bad;	/* shared across all groups */
+	struct xfs_verify_extent	*ve;
+	struct xfs_verify_extent	*v;
+	int				i;
+	unsigned int			groups;
+	bool				moveon;
+
+	/*
+	 * Initialize our per-thread context.  By convention,
+	 * the log device comes first, then the rt device, and then
+	 * the AGs.
+	 */
+	groups = xfs_scan_all_blocks_array_size(ctx);
+	ve = calloc(groups, sizeof(struct xfs_verify_extent));
+	if (!ve) {
+		str_errno(ctx, ctx->mntpoint);
+		return false;
+	}
+
+	moveon = bitmap_init(&d_bad);
+	if (!moveon) {
+		str_errno(ctx, ctx->mntpoint);
+		goto out_ve;
+	}
+
+	moveon = bitmap_init(&r_bad);
+	if (!moveon) {
+		str_errno(ctx, ctx->mntpoint);
+		goto out_dbad;
+	}
+
+	/* Every group records bad extents into the same two bitmaps. */
+	for (i = 0, v = ve; i < groups; i++, v++) {
+		v->d_bad = &d_bad;
+		v->r_bad = &r_bad;
+	}
+
+	moveon = xfs_read_verify_pool_init(ctx, xfs_check_rmap_ioerr);
+	if (!moveon)
+		goto out_rbad;
+	moveon = xfs_scan_all_blocks_array_arg(ctx, xfs_check_rmap,
+			ve, sizeof(*ve));
+	if (!moveon)
+		goto out_pool;
+
+	/* Flush out any reads still queued, then drain the pool. */
+	for (i = 0, v = ve; i < groups; i++, v++)
+		read_verify_force(&ctx->rvp, &v->rv);
+	read_verify_pool_destroy(&ctx->rvp);
+
+	/* Scan the whole dir tree to see what matches the bad extents. */
+	if (!bitmap_empty(&d_bad) || !bitmap_empty(&r_bad))
+		moveon = xfs_report_verify_errors(ctx, &d_bad, &r_bad);
+
+	bitmap_free(&r_bad);
+	bitmap_free(&d_bad);
+	free(ve);
+	return moveon;
+
+out_pool:
+	read_verify_pool_destroy(&ctx->rvp);
+out_rbad:
+	bitmap_free(&r_bad);
+out_dbad:
+	bitmap_free(&d_bad);
+out_ve:
+	free(ve);
+	return moveon;
+}
+
+/* Phase 6: Check summary counters. */
+
+/* Per-group tallies accumulated by the phase 6 scans. */
+struct xfs_summary_counts {
+	unsigned long long	inodes;		/* number of inodes */
+	unsigned long long	dbytes;		/* data dev bytes */
+	unsigned long long	rbytes;		/* rt dev bytes */
+	/*
+	 * End of the last data extent counted; used to avoid counting
+	 * shared (reflinked) physical extents more than once.
+	 */
+	unsigned long long	next_phys;	/* next phys bytes we see? */
+	unsigned long long	agbytes;	/* freespace bytes */
+	struct bitmap		dext;		/* data block extent bitmap */
+	struct bitmap		rext;		/* rt block extent bitmap */
+};
+
+/* Accumulator for one inode fork's block usage. */
+struct xfs_inode_fork_summary {
+	struct bitmap		*tree;
+	unsigned long long	bytes;
+};
+
+/* Bulkstat callback: tally one more inode into this group's summary. */
+static int
+xfs_record_inode_summary(
+	struct scrub_ctx		*ctx,
+	struct xfs_handle		*handle,
+	struct xfs_bstat		*bstat,
+	void				*arg)
+{
+	struct xfs_summary_counts	*sc = arg;
+
+	sc->inodes++;
+	return 0;
+}
+
+/*
+ * Record block usage.  GETFSMAP callback: accumulate data/rt byte
+ * counts, skipping log-device and free-space records.  Shared
+ * (reflinked) extents can appear once per owner, so next_phys is used
+ * to clamp out physical ranges we already counted.
+ */
+static bool
+xfs_record_block_summary(
+	struct scrub_ctx		*ctx,
+	const char			*descr,
+	struct fsmap			*fsmap,
+	void				*arg)
+{
+	struct xfs_summary_counts	*counts = arg;
+	unsigned long long		len;
+
+	/* Log blocks and free space don't count as used data. */
+	if (fsmap->fmr_device == ctx->fsinfo.fs_logdev)
+		return true;
+	if ((fsmap->fmr_flags & FMR_OF_SPECIAL_OWNER) &&
+	    fsmap->fmr_owner == XFS_FMR_OWN_FREE)
+		return true;
+
+	len = fsmap->fmr_length;
+
+	/* freesp btrees live in free space, need to adjust counters later. */
+	if ((fsmap->fmr_flags & FMR_OF_SPECIAL_OWNER) &&
+	    fsmap->fmr_owner == XFS_FMR_OWN_AG) {
+		counts->agbytes += fsmap->fmr_length;
+	}
+	if (fsmap->fmr_device == ctx->fsinfo.fs_rtdev) {
+		/* Count realtime extents. */
+		counts->rbytes += len;
+	} else {
+		/* Count datadev extents. */
+		if (counts->next_phys >= fsmap->fmr_physical + len)
+			return true;	/* entirely counted already */
+		else if (counts->next_phys > fsmap->fmr_physical)
+			len = counts->next_phys - fsmap->fmr_physical;
+		counts->dbytes += len;
+		counts->next_phys = fsmap->fmr_physical + fsmap->fmr_length;
+	}
+
+	return true;
+}
+
+/*
+ * Count all inodes and blocks in the filesystem, compare to superblock.
+ * Returns false on operational errors; count mismatches are reported
+ * via within_range but do not fail the phase by themselves.
+ */
+bool
+xfs_check_summary(
+	struct scrub_ctx	*ctx)
+{
+	/*
+	 * Zero-initialize the ioctl output structs; if XFS_IOC_FSCOUNTS
+	 * or XFS_IOC_GET_RESBLKS fails we warn and keep going, so these
+	 * must not be read uninitialized.
+	 */
+	struct xfs_fsop_counts	fc = {0};
+	struct xfs_fsop_resblks	rb = {0};
+	struct xfs_fsop_ag_resblks	arb;
+	struct statvfs		sfs;
+	struct xfs_summary_counts	*summary;
+	unsigned long long	fd;
+	unsigned long long	fr;
+	unsigned long long	fi;
+	unsigned long long	sd;
+	unsigned long long	sr;
+	unsigned long long	si;
+	unsigned long long	absdiff;
+	xfs_agnumber_t		agno;
+	bool			moveon;
+	bool			complain;
+	unsigned int		groups;
+	int			error;
+
+	/* One tally bucket per scan group (log, rt, AGs). */
+	groups = xfs_scan_all_blocks_array_size(ctx);
+	summary = calloc(groups, sizeof(struct xfs_summary_counts));
+	if (!summary) {
+		str_errno(ctx, ctx->mntpoint);
+		return false;
+	}
+
+	/* Flush everything out to disk before we start counting. */
+	error = syncfs(ctx->mnt_fd);
+	if (error) {
+		str_errno(ctx, ctx->mntpoint);
+		/* goto out, not return, so that summary is freed. */
+		moveon = false;
+		goto out;
+	}
+
+	/* Use fsmap to count blocks. */
+	moveon = xfs_scan_all_blocks_array_arg(ctx, xfs_record_block_summary,
+			summary, sizeof(*summary));
+	if (!moveon)
+		goto out;
+
+	/* Scan the whole fs. */
+	moveon = xfs_scan_all_inodes_array_arg(ctx, xfs_record_inode_summary,
+			summary, sizeof(*summary));
+	if (!moveon)
+		goto out;
+
+	/* Sum the per-group counts into summary[0]. */
+	for (agno = 1; agno < groups; agno++) {
+		summary[0].inodes += summary[agno].inodes;
+		summary[0].dbytes += summary[agno].dbytes;
+		summary[0].rbytes += summary[agno].rbytes;
+		summary[0].agbytes += summary[agno].agbytes;
+	}
+
+	/* Fetch the filesystem counters; on failure fc stays zeroed. */
+	error = ioctl(ctx->mnt_fd, XFS_IOC_FSCOUNTS, &fc);
+	if (error)
+		str_errno(ctx, ctx->mntpoint);
+
+	/* Grab the fstatvfs counters, since it has to report accurately. */
+	error = fstatvfs(ctx->mnt_fd, &sfs);
+	if (error) {
+		str_errno(ctx, ctx->mntpoint);
+		moveon = false;
+		goto out;
+	}
+
+	/*
+	 * XFS reserves some blocks to prevent hard ENOSPC, so add those
+	 * blocks back to the free data counts.  On failure rb stays zeroed.
+	 */
+	error = ioctl(ctx->mnt_fd, XFS_IOC_GET_RESBLKS, &rb);
+	if (error)
+		str_errno(ctx, ctx->mntpoint);
+	sfs.f_bfree += rb.resblks_avail;
+
+	/*
+	 * XFS with rmap or reflink reserves blocks in each AG to
+	 * prevent the AG from running out of space for metadata blocks.
+	 * Add those back to the free data counts.
+	 */
+	memset(&arb, 0, sizeof(arb));
+	error = ioctl(ctx->mnt_fd, XFS_IOC_GET_AG_RESBLKS, &arb);
+	if (error && errno != ENOTTY)
+		str_errno(ctx, ctx->mntpoint);
+	sfs.f_bfree += arb.resblks;
+
+	/*
+	 * If we counted blocks with fsmap, then dblocks includes
+	 * blocks for the AGFL and the freespace/rmap btrees.  The
+	 * filesystem treats them as "free", but since we scanned
+	 * them, we'll consider them used.
+	 */
+	sfs.f_bfree -= summary[0].agbytes >> ctx->blocklog;
+
+	/* Report on what we found: f* = fs view, s* = scan view. */
+	fd = (ctx->geo.datablocks - sfs.f_bfree) << ctx->blocklog;
+	fr = (ctx->geo.rtblocks - fc.freertx) << ctx->blocklog;
+	fi = sfs.f_files - sfs.f_ffree;
+	sd = summary[0].dbytes;
+	sr = summary[0].rbytes;
+	si = summary[0].inodes;
+
+	/*
+	 * Complain if the counts are off by more than 10% unless
+	 * the inaccuracy is less than 32MB worth of blocks or 100 inodes.
+	 */
+	absdiff = 1ULL << 25;
+	complain = !within_range(ctx, sd, fd, absdiff, 1, 10, _("data blocks"));
+	complain |= !within_range(ctx, sr, fr, absdiff, 1, 10, _("realtime blocks"));
+	complain |= !within_range(ctx, si, fi, 100, 1, 10, _("inodes"));
+
+	if (complain || verbose) {
+		double		d, r, i;
+		char		*du, *ru, *iu;
+
+		if (fr || sr) {
+			d = auto_space_units(fd, &du);
+			r = auto_space_units(fr, &ru);
+			i = auto_units(fi, &iu);
+			fprintf(stdout,
+_("%.1f%s data used;  %.1f%s realtime data used;  %.2f%s inodes used.\n"),
+					d, du, r, ru, i, iu);
+			d = auto_space_units(sd, &du);
+			r = auto_space_units(sr, &ru);
+			i = auto_units(si, &iu);
+			fprintf(stdout,
+_("%.1f%s data found; %.1f%s realtime data found; %.2f%s inodes found.\n"),
+					d, du, r, ru, i, iu);
+		} else {
+			d = auto_space_units(fd, &du);
+			i = auto_units(fi, &iu);
+			fprintf(stdout,
+_("%.1f%s data used;  %.2f%s inodes used.\n"),
+					d, du, i, iu);
+			d = auto_space_units(sd, &du);
+			i = auto_units(si, &iu);
+			fprintf(stdout,
+_("%.1f%s data found; %.2f%s inodes found.\n"),
+					d, du, i, iu);
+		}
+		fflush(stdout);
+	}
+	moveon = true;
+
+out:
+	/* dext/rext were zeroed by calloc; bitmap_free on them is a no-op. */
+	for (agno = 0; agno < groups; agno++) {
+		bitmap_free(&summary[agno].dext);
+		bitmap_free(&summary[agno].rext);
+	}
+	free(summary);
+	return moveon;
+}
+
+/* Shut down the filesystem, flushing the log first. */
+void
+xfs_shutdown_fs(
+	struct scrub_ctx		*ctx)
+{
+	int			flag = XFS_FSOP_GOING_FLAGS_LOGFLUSH;
+
+	str_info(ctx, ctx->mntpoint, _("Shutting down filesystem!"));
+	if (ioctl(ctx->mnt_fd, XFS_IOC_GOINGDOWN, &flag))
+		str_errno(ctx, ctx->mntpoint);
+}
new file mode 100644
@@ -0,0 +1,968 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "libxfs.h"
+#include <sys/statvfs.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include "disk.h"
+#include "../repair/threads.h"
+#include "handle.h"
+#include "path.h"
+#include "read_verify.h"
+#include "scrub.h"
+#include "xfs_ioctl.h"
+
+#define FSMAP_NR 65536
+#define BMAP_NR 2048
+
+/*
+ * Call the handler function for one bulkstat record, filling in the
+ * file handle first.  Returns the handler's error, or the abort code
+ * if too many errors have accumulated.
+ */
+static int
+xfs_iterate_inode_func(
+	struct scrub_ctx	*ctx,
+	xfs_inode_iter_fn	fn,
+	struct xfs_bstat	*bs,
+	struct xfs_handle	*handle,
+	void			*arg)
+{
+	int			ret;
+
+	handle->ha_fid.fid_ino = bs->bs_ino;
+	handle->ha_fid.fid_gen = bs->bs_gen;
+	ret = fn(ctx, handle, bs, arg);
+	if (!ret && xfs_scrub_excessive_errors(ctx))
+		ret = XFS_ITERATE_INODES_ABORT;
+	return ret;
+}
+
+/* Iterate a range of inodes. */
+bool
+xfs_iterate_inodes(
+ struct scrub_ctx *ctx,
+ const char *descr,
+ void *fshandle,
+ uint64_t first_ino,
+ uint64_t last_ino,
+ xfs_inode_iter_fn fn,
+ void *arg)
+{
+ struct xfs_fsop_bulkreq igrpreq = {0};
+ struct xfs_fsop_bulkreq bulkreq = {0};
+ struct xfs_fsop_bulkreq onereq = {0};
+ struct xfs_handle handle;
+ struct xfs_inogrp inogrp;
+ struct xfs_bstat bstat[XFS_INODES_PER_CHUNK] = {0};
+ char idescr[DESCR_BUFSZ];
+ char buf[DESCR_BUFSZ];
+ struct xfs_bstat *bs;
+ __u64 last_stale = first_ino - 1;
+ __u64 igrp_ino;
+ __u64 oneino;
+ __u64 ino;
+ __s32 bulklen = 0;
+ __s32 onelen = 0;
+ __s32 igrplen = 0;
+ bool moveon = true;
+ int i;
+ int error;
+ int stale_count = 0;
+
+ assert(!debug_tweak_on("XFS_SCRUB_NO_BULKSTAT"));
+
+ onereq.lastip = &oneino;
+ onereq.icount = 1;
+ onereq.ocount = &onelen;
+
+ bulkreq.lastip = &ino;
+ bulkreq.icount = XFS_INODES_PER_CHUNK;
+ bulkreq.ubuffer = &bstat;
+ bulkreq.ocount = &bulklen;
+
+ igrpreq.lastip = &igrp_ino;
+ igrpreq.icount = 1;
+ igrpreq.ubuffer = &inogrp;
+ igrpreq.ocount = &igrplen;
+
+ memcpy(&handle.ha_fsid, fshandle, sizeof(handle.ha_fsid));
+ handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
+ sizeof(handle.ha_fid.fid_len);
+ handle.ha_fid.fid_pad = 0;
+
+ /* Find the inode chunk & alloc mask */
+ igrp_ino = first_ino;
+ error = ioctl(ctx->mnt_fd, XFS_IOC_FSINUMBERS, &igrpreq);
+ while (!error && igrplen) {
+ /* Load the inodes. */
+ ino = inogrp.xi_startino - 1;
+ bulkreq.icount = inogrp.xi_alloccount;
+ error = ioctl(ctx->mnt_fd, XFS_IOC_FSBULKSTAT, &bulkreq);
+ if (error)
+ str_warn(ctx, descr, "%s", strerror_r(errno,
+ buf, DESCR_BUFSZ));
+
+ /* Did we get exactly the inodes we expected? */
+ for (i = 0, bs = bstat; i < XFS_INODES_PER_CHUNK; i++) {
+ if (!(inogrp.xi_allocmask & (1ULL << i)))
+ continue;
+ if (bs->bs_ino == inogrp.xi_startino + i) {
+ bs++;
+ continue;
+ }
+
+ /* Load the one inode. */
+ oneino = inogrp.xi_startino + i;
+ onereq.ubuffer = bs;
+ error = ioctl(ctx->mnt_fd, XFS_IOC_FSBULKSTAT_SINGLE,
+ &onereq);
+ if (error || bs->bs_ino != inogrp.xi_startino + i) {
+ memset(bs, 0, sizeof(struct xfs_bstat));
+ bs->bs_ino = inogrp.xi_startino + i;
+ bs->bs_blksize = ctx->mnt_sv.f_frsize;
+ }
+ bs++;
+ }
+
+ /* Iterate all the inodes. */
+ for (i = 0, bs = bstat; i < inogrp.xi_alloccount; i++, bs++) {
+ if (bs->bs_ino > last_ino)
+ goto out;
+
+ error = xfs_iterate_inode_func(ctx, fn, bs, &handle,
+ arg);
+ switch (error) {
+ case 0:
+ break;
+ case ESTALE:
+ if (last_stale == inogrp.xi_startino)
+ stale_count++;
+ else {
+ last_stale = inogrp.xi_startino;
+ stale_count = 0;
+ }
+ if (stale_count < 30) {
+ igrp_ino = inogrp.xi_startino;
+ goto igrp_retry;
+ }
+ snprintf(idescr, DESCR_BUFSZ, "inode %llu",
+ bs->bs_ino);
+ str_warn(ctx, idescr, "%s", strerror_r(error,
+ buf, DESCR_BUFSZ));
+ break;
+ case XFS_ITERATE_INODES_ABORT:
+ error = 0;
+ /* fall thru */
+ default:
+ moveon = false;
+ errno = error;
+ goto err;
+ }
+ }
+
+igrp_retry:
+ error = ioctl(ctx->mnt_fd, XFS_IOC_FSINUMBERS, &igrpreq);
+ }
+
+err:
+ if (error) {
+ str_errno(ctx, descr);
+ moveon = false;
+ }
+out:
+ return moveon;
+}
+
+/*
+ * Does the kernel support bulkstat?  Probe with a zero-count request;
+ * a working implementation rejects it with EINVAL.
+ */
+bool
+xfs_can_iterate_inodes(
+	struct scrub_ctx	*ctx)
+{
+	struct xfs_fsop_bulkreq	bulkreq = {0};
+	__u64			lastino = 0;
+	__s32			bulklen = 0;
+	int			error;
+
+	if (debug_tweak_on("XFS_SCRUB_NO_BULKSTAT"))
+		return false;
+
+	/* icount == 0 and ubuffer == NULL courtesy of the zero init. */
+	bulkreq.lastip = &lastino;
+	bulkreq.ocount = &bulklen;
+
+	error = ioctl(ctx->mnt_fd, XFS_IOC_FSBULKSTAT, &bulkreq);
+	return error == -1 && errno == EINVAL;
+}
+
+/*
+ * Iterate all the extent block mappings between the two keys, calling
+ * @fn for each mapping (converted to byte units).  On return, *key is
+ * updated with the last position so the caller can resume.
+ */
+bool
+xfs_iterate_bmap(
+	struct scrub_ctx	*ctx,
+	const char		*descr,
+	int			fd,
+	int			whichfork,
+	struct xfs_bmap		*key,
+	xfs_bmap_iter_fn	fn,
+	void			*arg)
+{
+	struct fsxattr		fsx;
+	struct getbmapx		*map;
+	struct getbmapx		*p;
+	struct xfs_bmap		bmap;
+	char			bmap_descr[DESCR_BUFSZ];
+	bool			moveon = true;
+	xfs_off_t		new_off;
+	int			getxattr_type;
+	int			i;
+	int			error;
+
+	assert(!debug_tweak_on("XFS_SCRUB_NO_BMAP"));
+
+	/* Tag the description with the fork being walked. */
+	switch (whichfork) {
+	case XFS_ATTR_FORK:
+		snprintf(bmap_descr, DESCR_BUFSZ, _("%s attr"), descr);
+		break;
+	case XFS_COW_FORK:
+		snprintf(bmap_descr, DESCR_BUFSZ, _("%s CoW"), descr);
+		break;
+	case XFS_DATA_FORK:
+		snprintf(bmap_descr, DESCR_BUFSZ, _("%s data"), descr);
+		break;
+	default:
+		/*
+		 * abort(), not assert(0): NDEBUG builds must not fall
+		 * through and use an uninitialized description.
+		 */
+		abort();
+	}
+
+	map = calloc(BMAP_NR, sizeof(struct getbmapx));
+	if (!map) {
+		str_errno(ctx, bmap_descr);
+		return false;
+	}
+
+	/* map[0] is the control record; getbmapx works in 512b units. */
+	map->bmv_offset = BTOBB(key->bm_offset);
+	map->bmv_block = BTOBB(key->bm_physical);
+	if (key->bm_length == 0)
+		map->bmv_length = ULLONG_MAX;
+	else
+		map->bmv_length = BTOBB(key->bm_length);
+	map->bmv_count = BMAP_NR;
+	map->bmv_iflags = BMV_IF_NO_DMAPI_READ | BMV_IF_PREALLOC |
+			  BMV_OF_DELALLOC | BMV_IF_NO_HOLES;
+	switch (whichfork) {
+	case XFS_ATTR_FORK:
+		getxattr_type = XFS_IOC_FSGETXATTRA;
+		map->bmv_iflags |= BMV_IF_ATTRFORK;
+		break;
+	case XFS_COW_FORK:
+		map->bmv_iflags |= BMV_IF_COWFORK;
+		getxattr_type = FS_IOC_FSGETXATTR;
+		break;
+	case XFS_DATA_FORK:
+		getxattr_type = FS_IOC_FSGETXATTR;
+		break;
+	default:
+		abort();
+	}
+
+	/* Fetch the fork's fsxattr info for the callback. */
+	error = ioctl(fd, getxattr_type, &fsx);
+	if (error < 0) {
+		str_errno(ctx, bmap_descr);
+		moveon = false;
+		goto out;
+	}
+
+	while ((error = ioctl(fd, XFS_IOC_GETBMAPX, map)) == 0) {
+		/* Records start at map[1]; convert each to bytes. */
+		for (i = 0, p = &map[i + 1]; i < map->bmv_entries; i++, p++) {
+			bmap.bm_offset = BBTOB(p->bmv_offset);
+			bmap.bm_physical = BBTOB(p->bmv_block);
+			bmap.bm_length = BBTOB(p->bmv_length);
+			bmap.bm_flags = p->bmv_oflags;
+			moveon = fn(ctx, bmap_descr, fd, whichfork, &fsx,
+					&bmap, arg);
+			if (!moveon)
+				goto out;
+			if (xfs_scrub_excessive_errors(ctx)) {
+				moveon = false;
+				goto out;
+			}
+		}
+
+		if (map->bmv_entries == 0)
+			break;
+		p = map + map->bmv_entries;
+		if (p->bmv_oflags & BMV_OF_LAST)
+			break;
+
+		/* Advance the key past everything we just saw. */
+		new_off = p->bmv_offset + p->bmv_length;
+		map->bmv_length -= new_off - map->bmv_offset;
+		map->bmv_offset = new_off;
+	}
+
+	/* Pre-reflink filesystems don't know about CoW forks. */
+	if (whichfork == XFS_COW_FORK && error && errno == EINVAL)
+		error = 0;
+
+	if (error)
+		str_errno(ctx, bmap_descr);
+out:
+	/*
+	 * Hand our position back to the caller, converting 512b units
+	 * back to bytes.  Copy the fields explicitly: a raw memcpy of
+	 * struct getbmapx (larger) into struct xfs_bmap (smaller) would
+	 * overflow *key and store values in the wrong units.
+	 */
+	key->bm_offset = BBTOB(map->bmv_offset);
+	key->bm_physical = BBTOB(map->bmv_block);
+	key->bm_length = BBTOB(map->bmv_length);
+	key->bm_flags = map->bmv_oflags;
+	free(map);
+	return moveon;
+}
+
+/*
+ * Does the kernel support getbmapx?  Issue a minimal query against the
+ * mountpoint and see if it succeeds.
+ */
+bool
+xfs_can_iterate_bmap(
+	struct scrub_ctx	*ctx)
+{
+	struct getbmapx		bsm[2];
+	int			error;
+
+	if (debug_tweak_on("XFS_SCRUB_NO_BMAP"))
+		return false;
+
+	/*
+	 * Zero the whole two-element array, not just bsm[0]; the kernel
+	 * fills results into bsm[1], which we shouldn't hand over
+	 * uninitialized.
+	 */
+	memset(bsm, 0, sizeof(bsm));
+	bsm->bmv_length = ULLONG_MAX;
+	bsm->bmv_count = 2;
+	error = ioctl(ctx->mnt_fd, XFS_IOC_GETBMAPX, bsm);
+	return error == 0;
+}
+
+/*
+ * Iterate all the fs block mappings between the two keys, calling @fn
+ * for every record the kernel returns.  keys[0]/keys[1] are the low
+ * and high ends of the query range.
+ */
+bool
+xfs_iterate_fsmap(
+	struct scrub_ctx	*ctx,
+	const char		*descr,
+	struct fsmap		*keys,
+	xfs_fsmap_iter_fn	fn,
+	void			*arg)
+{
+	struct fsmap_head	*head;
+	struct fsmap		*p;
+	bool			moveon = true;
+	int			i;
+	int			error;
+
+	assert(!debug_tweak_on("XFS_SCRUB_NO_FSMAP"));
+
+	head = malloc(fsmap_sizeof(FSMAP_NR));
+	if (!head) {
+		str_errno(ctx, descr);
+		return false;
+	}
+
+	/* Only the fixed-size header needs zeroing before the keys go in. */
+	memset(head, 0, sizeof(*head));
+	memcpy(head->fmh_keys, keys, sizeof(struct fsmap) * 2);
+	head->fmh_count = FSMAP_NR;
+
+	while ((error = ioctl(ctx->mnt_fd, FS_IOC_GETFSMAP, head)) == 0) {
+		for (i = 0, p = head->fmh_recs;
+		     i < head->fmh_entries;
+		     i++, p++) {
+			moveon = fn(ctx, descr, p, arg);
+			if (!moveon)
+				goto out;
+			if (xfs_scrub_excessive_errors(ctx)) {
+				moveon = false;
+				goto out;
+			}
+		}
+
+		if (head->fmh_entries == 0)
+			break;
+		p = &head->fmh_recs[head->fmh_entries - 1];
+		if (p->fmr_flags & FMR_OF_LAST)
+			break;
+		/* Move the low key past the last record we processed. */
+		fsmap_advance(head);
+	}
+
+	if (error) {
+		str_errno(ctx, descr);
+		moveon = false;
+	}
+out:
+	free(head);
+	return moveon;
+}
+
+/*
+ * Does the kernel support getfsmap?  Issue a whole-keyspace query and
+ * require that the results carry dev_t-encoded device fields.
+ */
+bool
+xfs_can_iterate_fsmap(
+	struct scrub_ctx	*ctx)
+{
+	struct fsmap_head	head = {0};
+	int			error;
+
+	if (debug_tweak_on("XFS_SCRUB_NO_FSMAP"))
+		return false;
+
+	/* High key saturated; we only care whether the call works. */
+	head.fmh_keys[1].fmr_device = UINT_MAX;
+	head.fmh_keys[1].fmr_physical = ULLONG_MAX;
+	head.fmh_keys[1].fmr_owner = ULLONG_MAX;
+	head.fmh_keys[1].fmr_offset = ULLONG_MAX;
+	error = ioctl(ctx->mnt_fd, FS_IOC_GETFSMAP, &head);
+	return error == 0 && (head.fmh_oflags & FMH_OF_DEV_T);
+}
+
+/* Online scrub and repair. */
+
+/* Type info and names for the scrub types. */
+enum scrub_type {
+ ST_NONE, /* disabled */
+ ST_AGHEADER, /* per-AG header */
+ ST_PERAG, /* per-AG metadata */
+ ST_FS, /* per-FS metadata */
+ ST_INODE, /* per-inode metadata */
+};
+struct scrub_descr {
+ const char *name;
+ enum scrub_type type;
+};
+
+/* These must correspond to XFS_SCRUB_TYPE_ */
+static const struct scrub_descr scrubbers[] = {
+ {"dummy", ST_NONE},
+ {"superblock", ST_AGHEADER},
+ {"free space header", ST_AGHEADER},
+ {"free list", ST_AGHEADER},
+ {"inode header", ST_AGHEADER},
+ {"freesp by block btree", ST_PERAG},
+ {"freesp by length btree", ST_PERAG},
+ {"inode btree", ST_PERAG},
+ {"free inode btree", ST_PERAG},
+ {"reverse mapping btree", ST_PERAG},
+ {"reference count btree", ST_PERAG},
+ {"inode record", ST_INODE},
+ {"data block map", ST_INODE},
+ {"attr block map", ST_INODE},
+ {"CoW block map", ST_INODE},
+ {"directory entries", ST_INODE},
+ {"extended attributes", ST_INODE},
+ {"symbolic link", ST_INODE},
+ {"realtime bitmap", ST_FS},
+ {"realtime summary", ST_FS},
+};
+
+/*
+ * Format a scrub description ("AG 3 inode btree", "Inode 42 data
+ * block map", ...) into @buf for error reporting.
+ */
+static void
+format_scrub_descr(
+	char				*buf,
+	size_t				buflen,
+	struct xfs_scrub_metadata	*meta,
+	const struct scrub_descr	*sc)
+{
+	switch (sc->type) {
+	case ST_AGHEADER:
+	case ST_PERAG:
+		snprintf(buf, buflen, _("AG %u %s"), meta->sm_agno,
+				_(sc->name));
+		break;
+	case ST_INODE:
+		snprintf(buf, buflen, _("Inode %llu %s"), meta->sm_ino,
+				_(sc->name));
+		break;
+	case ST_FS:
+		snprintf(buf, buflen, _("%s"), _(sc->name));
+		break;
+	case ST_NONE:
+	default:
+		/*
+		 * Can't happen, but don't hand back an uninitialized
+		 * buffer in NDEBUG builds.
+		 */
+		snprintf(buf, buflen, _("Unknown"));
+		assert(0);
+		break;
+	}
+}
+
+/* Do the scrub flags indicate corruption, directly or via cross-ref? */
+static inline bool
+IS_CORRUPT(
+	__u32			flags)
+{
+	return flags & (XFS_SCRUB_FLAG_CORRUPT | XFS_SCRUB_FLAG_XCORRUPT);
+}
+
+/* Do we need to repair something? */
+static inline bool
+xfs_scrub_needs_repair(
+	struct xfs_scrub_metadata	*sm)
+{
+	return IS_CORRUPT(sm->sm_flags);
+}
+
+/* Can we optimize something? */
+static inline bool
+xfs_scrub_needs_preen(
+	struct xfs_scrub_metadata	*sm)
+{
+	return sm->sm_flags & XFS_SCRUB_FLAG_PREEN;
+}
+
+/*
+ * Do a read-only check of some metadata.  Returns CHECK_ABORT if the
+ * fs has shut down, CHECK_REPAIR if the object should be queued for
+ * repair/preen given the current mode, and CHECK_DONE otherwise
+ * (clean, skipped, or reported but not actionable in this mode).
+ */
+static enum check_outcome
+xfs_check_metadata(
+	struct scrub_ctx		*ctx,
+	int				fd,
+	struct xfs_scrub_metadata	*meta,
+	bool				is_inode)
+{
+	char				buf[DESCR_BUFSZ];
+	int				error;
+
+	assert(!debug_tweak_on("XFS_SCRUB_NO_KERNEL"));
+	assert(meta->sm_type <= XFS_SCRUB_TYPE_MAX);
+	format_scrub_descr(buf, DESCR_BUFSZ, meta, &scrubbers[meta->sm_type]);
+
+	dbg_printf("check %s flags %xh\n", buf, meta->sm_flags);
+
+	error = ioctl(fd, XFS_IOC_SCRUB_METADATA, meta);
+	/*
+	 * Debug knob: pretend everything clean is preen-able so that a
+	 * repair gets exercised for every object.
+	 */
+	if (debug_tweak_on("XFS_SCRUB_FORCE_REPAIR") && !error)
+		meta->sm_flags |= XFS_SCRUB_FLAG_PREEN;
+	if (error) {
+		/* Metadata not present, just skip it. */
+		if (errno == ENOENT)
+			return CHECK_DONE;
+		else if (errno == ESHUTDOWN) {
+			/* FS already crashed, give up. */
+			str_error(ctx, buf,
+_("Filesystem is shut down, aborting."));
+			return CHECK_ABORT;
+		}
+
+		/* Operational error. */
+		str_errno(ctx, buf);
+		return CHECK_DONE;
+	} else if (!xfs_scrub_needs_repair(meta) &&
+		   !xfs_scrub_needs_preen(meta)) {
+		/* Clean operation, no corruption or preening detected. */
+		return CHECK_DONE;
+	} else if (xfs_scrub_needs_repair(meta) &&
+		   ctx->mode < SCRUB_MODE_REPAIR) {
+		/* Corrupt, but we're not in repair mode. */
+		str_error(ctx, buf, _("Repairs are required."));
+		return CHECK_DONE;
+	} else if (xfs_scrub_needs_preen(meta) &&
+		   ctx->mode < SCRUB_MODE_PREEN) {
+		/* Preenable, but we're not in preen mode. */
+		if (!is_inode) {
+			/* AG or FS metadata, always warn. */
+			str_info(ctx, buf, _("Optimization is possible."));
+		} else if (!ctx->preen_triggers[meta->sm_type]) {
+			/* File metadata, only warn once per type. */
+			pthread_mutex_lock(&ctx->lock);
+			if (!ctx->preen_triggers[meta->sm_type])
+				ctx->preen_triggers[meta->sm_type] = true;
+			pthread_mutex_unlock(&ctx->lock);
+		}
+		return CHECK_DONE;
+	}
+
+	return CHECK_REPAIR;
+}
+
+/*
+ * Bulk-notify user about things that could be optimized.  Clears each
+ * trigger as it is reported; the lock is dropped before the (possibly
+ * slow) str_info call, hence the unlock in both branches.
+ */
+void
+xfs_scrub_report_preen_triggers(
+	struct scrub_ctx		*ctx)
+{
+	int				i;
+
+	for (i = 0; i <= XFS_SCRUB_TYPE_MAX; i++) {
+		pthread_mutex_lock(&ctx->lock);
+		if (ctx->preen_triggers[i]) {
+			ctx->preen_triggers[i] = false;
+			pthread_mutex_unlock(&ctx->lock);
+			str_info(ctx, ctx->mntpoint,
+_("Optimizations of %s are possible."), scrubbers[i].name);
+		} else {
+			pthread_mutex_unlock(&ctx->lock);
+		}
+	}
+}
+
+/*
+ * Scrub metadata, saving corruption reports for later.  Walks the
+ * scrubbers table and invokes every scrubber whose type matches
+ * @scrub_type against AG @agno; anything that needs fixing is queued
+ * on @repair_list.  Returns false to abort the scrub.
+ */
+static bool
+xfs_scrub_metadata(
+	struct scrub_ctx		*ctx,
+	enum scrub_type			scrub_type,
+	xfs_agnumber_t			agno,
+	struct list_head		*repair_list)
+{
+	struct xfs_scrub_metadata	meta = {0};
+	const struct scrub_descr	*sc;
+	struct repair_item		*ri;
+	enum check_outcome		fix;
+	int				type;
+
+	sc = scrubbers;
+	for (type = 0; type <= XFS_SCRUB_TYPE_MAX; type++, sc++) {
+		if (sc->type != scrub_type)
+			continue;
+
+		/* Reuse meta across iterations; reset the per-call fields. */
+		meta.sm_type = type;
+		meta.sm_flags = 0;
+		meta.sm_agno = agno;
+
+		/* Check the item. */
+		fix = xfs_check_metadata(ctx, ctx->mnt_fd, &meta, false);
+		if (fix == CHECK_ABORT)
+			return false;
+		if (fix == CHECK_DONE)
+			continue;
+
+		/* Schedule this item for later repairs. */
+		ri = malloc(sizeof(struct repair_item));
+		if (!ri) {
+			str_errno(ctx, _("repair list"));
+			return false;
+		}
+		ri->op = meta;
+		list_add_tail(&ri->list, repair_list);
+	}
+
+	return true;
+}
+
+/* Scrub each AG's header blocks. */
+bool
+xfs_scrub_ag_headers(
+ struct scrub_ctx *ctx,
+ xfs_agnumber_t agno,
+ struct list_head *repair_list)
+{
+ return xfs_scrub_metadata(ctx, ST_AGHEADER, agno, repair_list);
+}
+
+/* Scrub each AG's metadata btrees. */
+bool
+xfs_scrub_ag_metadata(
+ struct scrub_ctx *ctx,
+ xfs_agnumber_t agno,
+ struct list_head *repair_list)
+{
+ return xfs_scrub_metadata(ctx, ST_PERAG, agno, repair_list);
+}
+
+/* Scrub whole-FS metadata btrees. */
+bool
+xfs_scrub_fs_metadata(
+ struct scrub_ctx *ctx,
+ struct list_head *repair_list)
+{
+ return xfs_scrub_metadata(ctx, ST_FS, 0, repair_list);
+}
+
+/*
+ * Scrub one piece of inode metadata identified by @type, via the file
+ * descriptor @fd.  Anything that needs fixing is queued on
+ * @repair_list; returns false to abort the whole scrub.
+ */
+static bool
+__xfs_scrub_file(
+	struct scrub_ctx		*ctx,
+	uint64_t			ino,
+	uint32_t			gen,
+	int				fd,
+	unsigned int			type,
+	struct list_head		*repair_list)
+{
+	struct xfs_scrub_metadata	meta = {0};
+	struct repair_item		*ri;
+	enum check_outcome		fix;
+
+	assert(type <= XFS_SCRUB_TYPE_MAX);
+	assert(scrubbers[type].type == ST_INODE);
+
+	meta.sm_type = type;
+	meta.sm_ino = ino;
+	meta.sm_gen = gen;
+
+	/* Scrub the piece of metadata. */
+	fix = xfs_check_metadata(ctx, fd, &meta, true);
+	if (fix == CHECK_ABORT)
+		return false;
+	if (fix == CHECK_DONE)
+		return true;
+
+	/* Schedule this item for later repairs. */
+	ri = malloc(sizeof(struct repair_item));
+	if (!ri) {
+		str_errno(ctx, _("repair list"));
+		return false;
+	}
+	ri->op = meta;
+	list_add_tail(&ri->list, repair_list);
+	return true;
+}
+
+#define XFS_SCRUB_FILE_PART(name, flagname) \
+bool \
+xfs_scrub_##name( \
+ struct scrub_ctx *ctx, \
+ uint64_t ino, \
+ uint32_t gen, \
+ int fd, \
+ struct list_head *repair_list) \
+{ \
+ return __xfs_scrub_file(ctx, ino, gen, fd, XFS_SCRUB_TYPE_##flagname, \
+ repair_list); \
+}
+XFS_SCRUB_FILE_PART(inode_fields, INODE)
+XFS_SCRUB_FILE_PART(data_fork, BMBTD)
+XFS_SCRUB_FILE_PART(attr_fork, BMBTA)
+XFS_SCRUB_FILE_PART(cow_fork, BMBTC)
+XFS_SCRUB_FILE_PART(dir, DIR)
+XFS_SCRUB_FILE_PART(attr, XATTR)
+XFS_SCRUB_FILE_PART(symlink, SYMLINK)
+
+/*
+ * Prioritize repair items in order of how long we can wait.
+ * 0 = do it now, 10000 = do it later.
+ *
+ * To minimize the amount of repair work, we want to prioritize metadata
+ * objects by perceived corruptness. If CORRUPT is set, the fields are
+ * just plain bad; try fixing that first. Otherwise if XCORRUPT is set,
+ * the fields could be bad, but the xref data could also be bad; we'll
+ * try fixing that next. Finally, if XFAIL is set, some other metadata
+ * structure failed validation during xref, so we'll recheck this
+ * metadata last since it was probably fine.
+ *
+ * For metadata that lie in the critical path of checking other metadata
+ * (superblock, AG{F,I,FL}, inobt) we scrub and fix those things before
+ * we even get to handling their dependencies, so things should progress
+ * in order.
+ */
+/*
+ * Map a repair op to a priority bucket by corruption severity:
+ * CORRUPT < XCORRUPT < XFAIL < PREEN, with @order breaking ties
+ * within each 100-wide bucket.  Aborts if no severity flag is set,
+ * since such an item should never have been queued.
+ */
+static int
+PRIO(
+	struct xfs_scrub_metadata	*op,
+	int				order)
+{
+	if (op->sm_flags & XFS_SCRUB_FLAG_CORRUPT)
+		return order;
+	else if (op->sm_flags & XFS_SCRUB_FLAG_XCORRUPT)
+		return 100 + order;
+	else if (op->sm_flags & XFS_SCRUB_FLAG_XFAIL)
+		return 200 + order;
+	else if (op->sm_flags & XFS_SCRUB_FLAG_PREEN)
+		return 300 + order;
+	abort();
+}
+
+static int
+xfs_repair_item_priority(
+ struct repair_item *ri)
+{
+ switch (ri->op.sm_type) {
+ case XFS_SCRUB_TYPE_SB:
+ return PRIO(&ri->op, 0);
+ case XFS_SCRUB_TYPE_AGF:
+ return PRIO(&ri->op, 1);
+ case XFS_SCRUB_TYPE_AGFL:
+ return PRIO(&ri->op, 2);
+ case XFS_SCRUB_TYPE_AGI:
+ return PRIO(&ri->op, 3);
+ case XFS_SCRUB_TYPE_BNOBT:
+ case XFS_SCRUB_TYPE_CNTBT:
+ case XFS_SCRUB_TYPE_INOBT:
+ case XFS_SCRUB_TYPE_FINOBT:
+ case XFS_SCRUB_TYPE_REFCNTBT:
+ return PRIO(&ri->op, 4);
+ case XFS_SCRUB_TYPE_RMAPBT:
+ return PRIO(&ri->op, 5);
+ case XFS_SCRUB_TYPE_INODE:
+ return PRIO(&ri->op, 6);
+ case XFS_SCRUB_TYPE_BMBTD:
+ case XFS_SCRUB_TYPE_BMBTA:
+ case XFS_SCRUB_TYPE_BMBTC:
+ return PRIO(&ri->op, 7);
+ case XFS_SCRUB_TYPE_DIR:
+ case XFS_SCRUB_TYPE_XATTR:
+ case XFS_SCRUB_TYPE_SYMLINK:
+ return PRIO(&ri->op, 8);
+ case XFS_SCRUB_TYPE_RTBITMAP:
+ case XFS_SCRUB_TYPE_RTSUM:
+ return PRIO(&ri->op, 9);
+ }
+ abort();
+}
+
+/* Sort callback: lower-priority (more urgent) repair items come first. */
+static int
+xfs_repair_item_compare(
+	void			*priv,
+	struct list_head	*a,
+	struct list_head	*b)
+{
+	int			prio_a;
+	int			prio_b;
+
+	prio_a = xfs_repair_item_priority(container_of(a,
+				struct repair_item, list));
+	prio_b = xfs_repair_item_priority(container_of(b,
+				struct repair_item, list));
+
+	return prio_a - prio_b;
+}
+
+/* Repair some metadata. */
+static enum check_outcome
+xfs_repair_metadata(
+ struct scrub_ctx *ctx,
+ int fd,
+ struct xfs_scrub_metadata *meta,
+ bool complain_if_still_broken)
+{
+ char buf[DESCR_BUFSZ];
+ __u32 oldf = meta->sm_flags;
+ int error;
+
+ assert(!debug_tweak_on("XFS_SCRUB_NO_KERNEL"));
+ meta->sm_flags |= XFS_SCRUB_FLAG_REPAIR;
+ assert(meta->sm_type <= XFS_SCRUB_TYPE_MAX);
+ format_scrub_descr(buf, DESCR_BUFSZ, meta, &scrubbers[meta->sm_type]);
+
+ if (xfs_scrub_needs_repair(meta))
+ str_info(ctx, buf, _("Attempting repair."));
+ else if (debug || verbose)
+ str_info(ctx, buf, _("Attempting optimization."));
+
+ error = ioctl(fd, XFS_IOC_SCRUB_METADATA, meta);
+ if (error) {
+ switch (errno) {
+ case ESHUTDOWN:
+ /* Filesystem is already shut down, abort. */
+ str_error(ctx, buf,
+_("Filesystem is shut down, aborting."));
+ return CHECK_ABORT;
+ case ENOTTY:
+ case EOPNOTSUPP:
+ /*
+ * If we forced repairs, don't complain if kernel
+ * doesn't know how to fix.
+ */
+ if (debug_tweak_on("XFS_SCRUB_FORCE_REPAIR"))
+ return CHECK_DONE;
+ /* fall through */
+ case EINVAL:
+ /* Kernel doesn't know how to repair this? */
+ if (complain_if_still_broken)
+ str_error(ctx, buf,
+_("Don't know how to fix; offline repair required."));
+ return CHECK_REPAIR;
+ case EROFS:
+ /* Read-only filesystem, can't fix. */
+ if (verbose || debug || IS_CORRUPT(oldf))
+ str_info(ctx, buf,
+_("Read-only filesystem; cannot make changes."));
+ return CHECK_DONE;
+ case ENOENT:
+ /* Metadata not present, just skip it. */
+ return CHECK_DONE;
+ case ENOMEM:
+ case ENOSPC:
+ /* Don't care if preen fails due to low resources. */
+ if (oldf & XFS_SCRUB_FLAG_PREEN)
+ return CHECK_DONE;
+ /* fall through */
+ default:
+ /* Operational error. */
+ str_errno(ctx, buf);
+ return CHECK_DONE;
+ }
+ } else if (xfs_scrub_needs_repair(meta)) {
+ /* Still broken, try again or fix offline. */
+ if (complain_if_still_broken)
+ str_error(ctx, buf,
+_("Repair unsuccessful; offline repair required."));
+ return CHECK_REPAIR;
+ } else {
+ /* Clean operation, no corruption detected. */
+ if (IS_CORRUPT(oldf))
+ record_repair(ctx, buf, _("Repairs successful."));
+ else
+ record_preen(ctx, buf, _("Optimization successful."));
+ return CHECK_DONE;
+ }
+}
+
+/*
+ * Repair everything on this list, most-corrupt items first.  Items
+ * that are successfully handled are removed and freed; anything left
+ * on the list afterwards still needs repair (CHECK_REPAIR).  @flags:
+ * XRML_REPAIR_ONLY skips preen-only items, XRML_NOFIX_COMPLAIN makes
+ * unfixable items an error.
+ */
+bool
+xfs_repair_metadata_list(
+	struct scrub_ctx	*ctx,
+	int			fd,
+	struct list_head	*repair_list,
+	unsigned int		flags)
+{
+	struct repair_item	*ri;
+	struct repair_item	*n;
+	enum check_outcome	fix;
+
+	list_sort(NULL, repair_list, xfs_repair_item_compare);
+
+	list_for_each_entry_safe(ri, n, repair_list, list) {
+		if (!IS_CORRUPT(ri->op.sm_flags) &&
+		    (flags & XRML_REPAIR_ONLY))
+			continue;
+		fix = xfs_repair_metadata(ctx, fd, &ri->op,
+				flags & XRML_NOFIX_COMPLAIN);
+		if (fix == CHECK_ABORT)
+			return false;
+		else if (fix == CHECK_REPAIR)
+			continue;	/* leave on the list for a retry */
+
+		list_del(&ri->list);
+		free(ri);
+	}
+
+	return !xfs_scrub_excessive_errors(ctx);
+}
+
+/*
+ * Test the availability of a kernel scrub command.  Returns true if
+ * the ioctl either succeeds or fails with any errno other than
+ * EOPNOTSUPP/ENOTTY (i.e. the kernel at least recognizes the command).
+ */
+static bool
+__xfs_scrub_test(
+	struct scrub_ctx		*ctx,
+	unsigned int			type)
+{
+	struct xfs_scrub_metadata	meta = {0};
+	struct xfs_error_injection	inject;
+	static bool			injected;
+	int				error;
+
+	if (debug_tweak_on("XFS_SCRUB_NO_KERNEL"))
+		return false;
+	/* Arm the force-repair error tag once per process. */
+	if (debug_tweak_on("XFS_SCRUB_FORCE_REPAIR") && !injected) {
+		inject.fd = ctx->mnt_fd;
+#define XFS_ERRTAG_FORCE_REPAIR	28
+		inject.errtag = XFS_ERRTAG_FORCE_REPAIR;
+		error = ioctl(ctx->mnt_fd,
+				XFS_IOC_ERROR_INJECTION, &inject);
+		if (error == 0)
+			injected = true;
+	}
+
+	meta.sm_type = type;
+	error = ioctl(ctx->mnt_fd, XFS_IOC_SCRUB_METADATA, &meta);
+	return error == 0 || (error && errno != EOPNOTSUPP && errno != ENOTTY);
+}
+
+#define XFS_CAN_SCRUB_TEST(name, flagname) \
+bool \
+xfs_can_scrub_##name( \
+ struct scrub_ctx *ctx) \
+{ \
+ return __xfs_scrub_test(ctx, XFS_SCRUB_TYPE_##flagname); \
+}
+XFS_CAN_SCRUB_TEST(fs_metadata, SB)
+XFS_CAN_SCRUB_TEST(inode, INODE)
+XFS_CAN_SCRUB_TEST(bmap, BMBTD)
+XFS_CAN_SCRUB_TEST(dir, DIR)
+XFS_CAN_SCRUB_TEST(attr, XATTR)
+XFS_CAN_SCRUB_TEST(symlink, SYMLINK)
new file mode 100644
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef XFS_IOCTL_H_
+#define XFS_IOCTL_H_
+
+/* Inode iteration: walk inodes in the range first_ino..last_ino. */
+#define XFS_ITERATE_INODES_ABORT (-1) /* iter fn returns this to stop the walk */
+typedef int (*xfs_inode_iter_fn)(struct scrub_ctx *ctx,
+ struct xfs_handle *handle, struct xfs_bstat *bs, void *arg);
+bool xfs_iterate_inodes(struct scrub_ctx *ctx, const char *descr,
+ void *fshandle, uint64_t first_ino, uint64_t last_ino,
+ xfs_inode_iter_fn fn, void *arg);
+bool xfs_can_iterate_inodes(struct scrub_ctx *ctx);
+
+/* Inode fork block mapping; all quantities below are in bytes. */
+struct xfs_bmap {
+ uint64_t bm_offset; /* file offset of segment in bytes */
+ uint64_t bm_physical; /* physical starting byte */
+ uint64_t bm_length; /* length of segment, bytes */
+ uint32_t bm_flags; /* output flags */
+};
+
+typedef bool (*xfs_bmap_iter_fn)(struct scrub_ctx *ctx, const char *descr,
+ int fd, int whichfork, struct fsxattr *fsx,
+ struct xfs_bmap *bmap, void *arg);
+
+bool xfs_iterate_bmap(struct scrub_ctx *ctx, const char *descr, int fd,
+ int whichfork, struct xfs_bmap *key, xfs_bmap_iter_fn fn,
+ void *arg);
+bool xfs_can_iterate_bmap(struct scrub_ctx *ctx);
+
+/* Filesystem reverse-mapping (fsmap) iteration. */
+typedef bool (*xfs_fsmap_iter_fn)(struct scrub_ctx *ctx, const char *descr,
+ struct fsmap *fsr, void *arg);
+bool xfs_iterate_fsmap(struct scrub_ctx *ctx, const char *descr,
+ struct fsmap *keys, xfs_fsmap_iter_fn fn, void *arg);
+bool xfs_can_iterate_fsmap(struct scrub_ctx *ctx);
+
+/* Online scrub and repair. */
+enum check_outcome {
+ CHECK_DONE, /* no further action needed */
+ CHECK_REPAIR, /* item still needs repair */
+ CHECK_ABORT, /* stop all scrub activity */
+};
+
+struct repair_item {
+ struct list_head list; /* linkage in a repair_list */
+ struct xfs_scrub_metadata op; /* kernel scrub/repair request */
+};
+
+void xfs_scrub_report_preen_triggers(struct scrub_ctx *ctx);
+bool xfs_scrub_ag_headers(struct scrub_ctx *ctx, xfs_agnumber_t agno,
+ struct list_head *repair_list);
+bool xfs_scrub_ag_metadata(struct scrub_ctx *ctx, xfs_agnumber_t agno,
+ struct list_head *repair_list);
+bool xfs_scrub_fs_metadata(struct scrub_ctx *ctx,
+ struct list_head *repair_list);
+
+#define XRML_REPAIR_ONLY 1 /* no optimizations */
+#define XRML_NOFIX_COMPLAIN 2 /* complain if still corrupt */
+bool xfs_repair_metadata_list(struct scrub_ctx *ctx, int fd,
+ struct list_head *repair_list, unsigned int flags);
+
+bool xfs_can_scrub_fs_metadata(struct scrub_ctx *ctx);
+bool xfs_can_scrub_inode(struct scrub_ctx *ctx);
+bool xfs_can_scrub_bmap(struct scrub_ctx *ctx);
+bool xfs_can_scrub_dir(struct scrub_ctx *ctx);
+bool xfs_can_scrub_attr(struct scrub_ctx *ctx);
+bool xfs_can_scrub_symlink(struct scrub_ctx *ctx);
+
+/* Per-inode scrubbers: ino/gen identify the inode, fd is an open handle. */
+bool xfs_scrub_inode_fields(struct scrub_ctx *ctx, uint64_t ino, uint32_t gen,
+ int fd, struct list_head *repair_list);
+bool xfs_scrub_data_fork(struct scrub_ctx *ctx, uint64_t ino, uint32_t gen,
+ int fd, struct list_head *repair_list);
+bool xfs_scrub_attr_fork(struct scrub_ctx *ctx, uint64_t ino, uint32_t gen,
+ int fd, struct list_head *repair_list);
+bool xfs_scrub_cow_fork(struct scrub_ctx *ctx, uint64_t ino, uint32_t gen,
+ int fd, struct list_head *repair_list);
+bool xfs_scrub_dir(struct scrub_ctx *ctx, uint64_t ino, uint32_t gen,
+ int fd, struct list_head *repair_list);
+bool xfs_scrub_attr(struct scrub_ctx *ctx, uint64_t ino, uint32_t gen,
+ int fd, struct list_head *repair_list);
+bool xfs_scrub_symlink(struct scrub_ctx *ctx, uint64_t ino, uint32_t gen,
+ int fd, struct list_head *repair_list);
+
+#endif /* XFS_IOCTL_H_ */