diff mbox

[v2,4/5] scrub userland implementation

Message ID 6321f041bfb9f439198cc03d2f7cb8ce8c5db867.1301503683.git.list.btrfs@jan-o-sch.net (mailing list archive)
State New, archived
Headers show

Commit Message

Jan Schmidt March 30, 2011, 4:53 p.m. UTC
None

Comments

Hugo Mills July 10, 2011, 6:23 p.m. UTC | #1
Yes, this is over three months after the initial posting, but since
nobody else has looked at it yet, and the patch is in my integration
stack...

   I've not reviewed the whole thing -- just the "scrub start" code so
far. I've removed the bits I've not checked from the file below.

On Wed, Mar 30, 2011 at 06:53:12PM +0200, Jan Schmidt wrote:

   No commit message at all?

> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
> ---
>  scrub.c | 1568 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 1568 insertions(+), 0 deletions(-)

   This is quite big to review in one lump... Is it possible to split
the patch into functional sections? (Add shared infrastructure, then
each of the four functions separately, maybe?)

> diff --git a/scrub.c b/scrub.c
> new file mode 100644
> index 0000000..22052ed
> --- /dev/null
> +++ b/scrub.c
> @@ -0,0 +1,1568 @@

   It seems to be conventional to put a GPL notice at the top of
source files... :)

> +
> +#include <sys/ioctl.h>
> +#include <sys/wait.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <sys/un.h>
> +#include <poll.h>
> +#include <sys/file.h>
> +#include <uuid/uuid.h>
> +#include <fcntl.h>
> +#include <unistd.h>
> +#include <pthread.h>
> +#include <ctype.h>
> +#include <signal.h>
> +#include <stdarg.h>
> +
> +#include "ctree.h"
> +#include "ioctl.h"
> +#include "btrfs_cmds.h"
> +#include "utils.h"
> +#include "volumes.h"
> +#include "disk-io.h"
> +
> +#define SCRUB_DATA_FILE "/var/btrfs/scrub.status"
> +#define SCRUB_PROGRESS_SOCKET_PATH "/var/btrfs/scrub.progress"

   I'd suggest /var/lib/btrfs/[...] instead. Putting it in the top
level of /var seems a bit presumptuous (and contravenes the FHS).

> +#define SCRUB_FILE_VERSION_PREFIX "scrub status:"

   I'd drop the : from this, since there's no colons used in other key
names elsewhere (in scrub_read_file() and _scrub_kvread()), and the
correction for the colon at [1] is confusing.

> +#define SCRUB_FILE_VERSION "1"
> +
> +struct scrub_stats {
> +	time_t t_start;
> +	time_t t_resumed;
> +	u64 duration;
> +	u64 finished;
> +	u64 canceled;
> +};
> +
> +struct scrub_progress {
> +	struct btrfs_ioctl_scrub_args scrub_args;
> +	int fd;
> +	int ret;
> +	int skip;
> +	struct scrub_stats stats;
> +	struct scrub_file_record *resumed;
> +	int ioctl_errno;
> +	pthread_mutex_t progress_mutex;
> +};
> +
> +struct scrub_file_record {
> +	u8 fsid[BTRFS_FSID_SIZE];
> +	u64 devid;
> +	struct scrub_stats stats;
> +	struct btrfs_scrub_progress p;
> +};
> +
> +struct scrub_progress_cycle {
> +	int fdmnt;
> +	int prg_fd;
> +	int do_record;
> +	struct btrfs_ioctl_fs_info_args *fi;
> +	struct scrub_progress *progress;
> +	struct scrub_progress *shared_progress;
> +	pthread_mutex_t *write_mutex;
> +};
> +
> +struct scrub_fs_stat {
> +	struct btrfs_scrub_progress p;
> +	struct scrub_stats s;
> +	int i;
> +};
> +
> +static void print_scrub_full(struct btrfs_scrub_progress *sp)
> +{
> +	printf("\tdata_extents_scrubbed: %lld\n", sp->data_extents_scrubbed);
> +	printf("\ttree_extents_scrubbed: %lld\n", sp->tree_extents_scrubbed);
> +	printf("\tdata_bytes_scrubbed: %lld\n", sp->data_bytes_scrubbed);
> +	printf("\ttree_bytes_scrubbed: %lld\n", sp->tree_bytes_scrubbed);
> +	printf("\tread_errors: %lld\n", sp->read_errors);
> +	printf("\tcsum_errors: %lld\n", sp->csum_errors);
> +	printf("\tverify_errors: %lld\n", sp->verify_errors);
> +	printf("\tno_csum: %lld\n", sp->no_csum);
> +	printf("\tcsum_discards: %lld\n", sp->csum_discards);
> +	printf("\tsuper_errors: %lld\n", sp->super_errors);
> +	printf("\tmalloc_errors: %lld\n", sp->malloc_errors);
> +	printf("\tuncorrectable_errors: %lld\n", sp->uncorrectable_errors);
> +	printf("\tcorrected_errors: %lld\n", sp->corrected_errors);
> +	printf("\tlast_physical: %lld\n", sp->last_physical);
> +}
> +
> +#define err(test, ...) do {			\
> +	if (test)				\
> +		fprintf(stderr, __VA_ARGS__);	\
> +} while (0)
> +
> +#define PRINT_SCRUB_ERROR(test, desc) do {	\
> +	if (test)				\
> +		printf(" %s=%llu", desc, test);	\
> +} while (0)

   Extra line of space here, otherwise it looks like the function is
part of the macro. (And in a number of other places throughout the
file, too)

> +static void print_scrub_summary(struct btrfs_scrub_progress *p)
> +{
> +	u64 err_cnt;
> +	u64 err_cnt2;
> +
> +	err_cnt = p->read_errors +
> +			p->csum_errors +
> +			p->verify_errors +
> +			p->csum_discards +
> +			p->super_errors +
> +			p->malloc_errors;
> +
> +	err_cnt2 = p->corrected_errors + p->uncorrectable_errors;
> +
> +	printf("\ttotal bytes scrubbed: %s with %llu errors\n",
> +		pretty_sizes(p->data_bytes_scrubbed + p->tree_bytes_scrubbed),
> +		max(err_cnt, err_cnt2));

   Memory leak: pretty_sizes() mallocs space for its result.

> +	if (err_cnt || err_cnt2) {
> +		printf("\terror details:");
> +		PRINT_SCRUB_ERROR(p->read_errors, "read");
> +		PRINT_SCRUB_ERROR(p->super_errors, "super");
> +		PRINT_SCRUB_ERROR(p->malloc_errors, "malloc");
> +		PRINT_SCRUB_ERROR(p->verify_errors, "verify");
> +		PRINT_SCRUB_ERROR(p->csum_errors, "csum");
> +		PRINT_SCRUB_ERROR(p->csum_discards, "csum-discards");
> +		printf("\n");
> +		printf("\tcorrected errors: %llu, uncorrectable errors: %llu\n",
> +		       p->corrected_errors, p->uncorrectable_errors);
> +	}
> +}
> +
> +#define _SCRUB_FS_STAT(p, name, fs_stat) fs_stat->p.name += p->name

   checkpatch.pl says:
scrub.c:130: ERROR: Macros with complex values should be enclosed in parenthesis

(and in general, checkpatch.pl is whinging about lots of whitespace/
indentation issues with this file)

> +#define _SCRUB_FS_STAT_MIN(ss, name, fs_stat)	\
> +do {						\
> +	if (fs_stat->s.name > ss->name) {	\
> +		fs_stat->s.name = ss->name;	\
> +	}					\
> +} while (0)
> +#define _SCRUB_FS_STAT_ZMIN(ss, name, fs_stat)			\
> +do {								\
> +	if (!fs_stat->s.name || fs_stat->s.name > ss->name) {	\
> +		fs_stat->s.name = ss->name;			\
> +	}							\
> +} while (0)
> +#define _SCRUB_FS_STAT_MAX(ss, name, fs_stat)			\

   Maybe use _SCRUB_FS_STAT_ZMAX to match the ZMIN usage above?

> +do {								\
> +	if (!fs_stat->s.name || fs_stat->s.name < ss->name) {	\
> +		fs_stat->s.name = ss->name;			\
> +	}							\
> +} while (0)

   Line of space needed here.

> +static void add_to_fs_stat(struct btrfs_scrub_progress *p,
> +                           struct scrub_stats *ss,
> +                           struct scrub_fs_stat *fs_stat)
> +{
> +	_SCRUB_FS_STAT(p, data_extents_scrubbed, fs_stat);
> +	_SCRUB_FS_STAT(p, tree_extents_scrubbed, fs_stat);
> +	_SCRUB_FS_STAT(p, data_bytes_scrubbed, fs_stat);
> +	_SCRUB_FS_STAT(p, tree_bytes_scrubbed, fs_stat);
> +	_SCRUB_FS_STAT(p, read_errors, fs_stat);
> +	_SCRUB_FS_STAT(p, csum_errors, fs_stat);
> +	_SCRUB_FS_STAT(p, verify_errors, fs_stat);
> +	_SCRUB_FS_STAT(p, no_csum, fs_stat);
> +	_SCRUB_FS_STAT(p, csum_discards, fs_stat);
> +	_SCRUB_FS_STAT(p, super_errors, fs_stat);
> +	_SCRUB_FS_STAT(p, malloc_errors, fs_stat);
> +	_SCRUB_FS_STAT(p, uncorrectable_errors, fs_stat);
> +	_SCRUB_FS_STAT(p, corrected_errors, fs_stat);
> +	_SCRUB_FS_STAT(p, last_physical, fs_stat);
> +	_SCRUB_FS_STAT_ZMIN(ss, t_start, fs_stat);
> +	_SCRUB_FS_STAT_ZMIN(ss, t_resumed, fs_stat);
> +	_SCRUB_FS_STAT_MAX(ss, duration, fs_stat);
> +	_SCRUB_FS_STAT_MAX(ss, canceled, fs_stat);
> +	_SCRUB_FS_STAT_MIN(ss, finished, fs_stat);
> +}
> +
> +static void init_fs_stat(struct scrub_fs_stat *fs_stat)
> +{
> +	memset(fs_stat, 0, sizeof(*fs_stat));
> +	fs_stat->s.finished = 2;

   What does 2 mean? ->s.finished seems to be a boolean everywhere
except here. Can you turn this value into a more descriptive #define?
Or just use 1?

> +}
> +
> +static void _print_scrub_ss(struct scrub_stats *ss)
> +{
> +	char t[BTRFS_PATH_NAME_MAX+1];

   Since this string is used for storing formatted dates, there's a
little cognitive dissonance over it being long enough to store a btrfs
path name... (Yeah, OK, it's a convenient large value, but still a tad
confusing to use it here)

> +	struct tm tm;
> +
> +	if (!ss || !ss->t_start) {
> +		printf("\tno stats available\n");
> +		return;
> +	}
> +	if (ss->t_resumed) {
> +		localtime_r(&ss->t_resumed, &tm);
> +		strftime(t, sizeof(t), "%c", &tm);

   strftime doesn't append a terminating zero if the string is longer
than the buffer it's filling.

> +		printf("\tscrub resumed at %s", t);
> +	} else {
> +		localtime_r(&ss->t_start, &tm);
> +		strftime(t, sizeof(t), "%c", &tm);
> +		printf("\tscrub started at %s", t);
> +	}
> +	if (ss->finished && !ss->canceled) {
> +		printf(" and finished after %llu seconds\n",
> +		       ss->duration);
> +	} else if (ss->canceled) {
> +		printf(" and was aborted after %llu seconds\n",
> +		       ss->duration);
> +	} else {
> +		printf(", running for %llu seconds\n", ss->duration);
> +	}
> +}
> +
> +static void print_scrub_dev(struct btrfs_ioctl_dev_info_args *di,
> +                            struct btrfs_scrub_progress *p, int raw,
> +                            const char *append, struct scrub_stats *ss)
> +{
> +	printf("scrub device %s (id %llu) %s\n", di->path, di->devid,
> +	       append ? append : "");
> +
> +	_print_scrub_ss(ss);
> +
> +	if (p) {
> +		if (raw)
> +			print_scrub_full(p);
> +		else
> +			print_scrub_summary(p);
> +	}
> +}
> +
> +static void print_fs_stat(struct scrub_fs_stat *fs_stat, int raw)
> +{
> +	_print_scrub_ss(&fs_stat->s);
> +
> +	if (raw)
> +		print_scrub_full(&fs_stat->p);
> +	else
> +		print_scrub_summary(&fs_stat->p);
> +}
> +
> +static void free_history(struct scrub_file_record **last_scrubs)
> +{
> +	struct scrub_file_record **l = last_scrubs;
> +	if (!l)
> +		return;
> +	while (*l)
> +		free(*l++);
> +	free(last_scrubs);
> +}
> +
> +static int cancel_fd = -1;
> +static void scrub_sigint_record_progress(int signal)

   What does this function have to do with recording progress? Seems a
bit of a misnomer to me. (Call it scrub_sigint_cancel_scrub, maybe?)

> +{
> +	ioctl(cancel_fd, BTRFS_IOC_SCRUB_CANCEL, NULL);
> +}
> +
> +static int scrub_handle_sigint_parent(void)
> +{
> +	struct sigaction sa = {
> +		.sa_handler = SIG_IGN,
> +		.sa_flags = SA_RESTART,
> +	};
> +
> +	return sigaction(SIGINT, &sa, NULL);
> +}
> +
> +static int scrub_handle_sigint_child(int fd)
> +{
> +	struct sigaction sa = {
> +		.sa_handler = fd == -1 ? SIG_DFL : scrub_sigint_record_progress,
> +	};
> +
> +	cancel_fd = fd;
> +	return sigaction(SIGINT, &sa, NULL);
> +}
> +
> +static int _scrub_datafile(const char *fn_base, const char *fn_local,
> +                           const char *fn_tmp, char *datafile, int max)
> +{
> +	int ret;
> +
> +	strncpy(datafile, fn_base, max);

   You need to put a zero byte at datafile[max], otherwise it could be
unterminated (if max <= strlen(fn_base)), and the strlen will then run
off the end of the string.

> +	ret = strlen(datafile);
> +	
> +	if (ret + 1 >= max)
> +		return -EOVERFLOW;

   This will never happen (if you put the zero terminator in)

> +	datafile[ret] = '.';
> +	strncpy(datafile+ret+1, fn_local, max-ret-1);

   ... and add a zero byte here, too (or use strncat)

> +	ret = strlen(datafile);
> +
> +	if (ret + 1 >= max)
> +		return -EOVERFLOW;

   as above: won't happen

> +	if (fn_tmp) {
> +		datafile[ret] = '_';
> +		strncpy(datafile+ret+1, fn_tmp, max-ret-1);

   ... and add a zero byte here (or use strncat)

> +		ret = strlen(datafile);
> +
> +		if (ret >= max)
> +			return -EOVERFLOW;
> +	}
> +
> +	return 0;
> +}
> +
> +static int _scrub_open_file(const char *datafile, int m)

   Just a niggle: Why the leading _ when other scrub-specific
functions don't have it? (There's about a dozen other such symbols --
was there a criterion you used to decide which ones have _ and which
don't?)

> +{
> +	int fd;
> +	int ret;
> +
> +	fd = open(datafile, m, 0600);
> +	if (fd < 0)
> +		return -errno;
> +
> +	ret = flock(fd, LOCK_EX|LOCK_NB);
> +	if (ret) {
> +		ret = errno;
> +		close(fd);
> +		return -ret;
> +	}
> +
> +	return fd;
> +}
> +
> +static int scrub_open_file_r(const char *fn_base, const char *fn_local)
> +{
> +	int ret;
> +	char datafile[BTRFS_PATH_NAME_MAX+1];
> +	ret = _scrub_datafile(fn_base, fn_local, NULL,
> +	                      datafile, sizeof(datafile));
> +	if (ret < 0)
> +		return ret;
> +	return _scrub_open_file(datafile, O_RDONLY);
> +}
> +
> +static int scrub_open_file_w(const char *fn_base, const char *fn_local,
> +                             const char *tmp)
> +{
> +	int ret;
> +	char datafile[BTRFS_PATH_NAME_MAX+1];
> +	ret = _scrub_datafile(fn_base, fn_local, tmp,
> +	                      datafile, sizeof(datafile));
> +	if (ret < 0)
> +		return ret;
> +	return _scrub_open_file(datafile, O_WRONLY|O_CREAT);
> +}
> +
> +static int scrub_rename_file(const char *fn_base, const char *fn_local,
> +                             const char *tmp)
> +{
> +	int ret;
> +	char datafile_old[BTRFS_PATH_NAME_MAX+1];
> +	char datafile_new[BTRFS_PATH_NAME_MAX+1];
> +	ret = _scrub_datafile(fn_base, fn_local, tmp,
> +	                      datafile_old, sizeof(datafile_old));
> +	if (ret < 0)
> +		return ret;
> +	ret = _scrub_datafile(fn_base, fn_local, NULL,
> +	                      datafile_new, sizeof(datafile_new));
> +	if (ret < 0)
> +		return ret;
> +	ret = rename(datafile_old, datafile_new);
> +	return ret ? -errno : 0;
> +}
> +
> +#define _SCRUB_KVREAD(i, name, avail, l, dest) \
> +	_scrub_kvread(i, sizeof(#name), avail, l, #name, dest.name)
> +#define _SCRUB_KVREAD_STATS(i, name, avail, l, dest) \
> +	_scrub_kvread(i, sizeof(#name), avail, l, #name, dest->stats.name)
> +/*
> + * returns 0 if the key did not match (nothing was read)
> + *         1 if the key did match (success)
> + *        -1 if the key did match and an error occured
> + */
> +static int _scrub_kvread(int *i, int len, int avail, const char *buf,
> +                         const char *key, u64 *dest)
> +{
> +	int j;
> +
> +	if (*i+len+1 < avail && strncmp(&buf[*i], key, len-1) == 0) {
> +		*i += len-1;
> +		if (buf[*i] != ':') {
> +			return -1;
> +		}
> +		*i += 1;
> +		for (j=0; isdigit(buf[*i+j]) && *i+j < avail; ++j)
> +			;
> +		if (*i+j >= avail)
> +			return -1;
> +		*dest = atoll(&buf[*i]);
> +		*i += j;
> +		return 1;
> +	}
> +	
> +	return 0;
> +}
> +
> +#define _SCRUB_ILLEGAL do {						\

   I'd better call the police, then... :) I think you mean invalid (or
unexpected, or unparsable), not illegal. (and likewise in the message,
below, of course).

> +	if (report_errors) {						\
> +		fprintf(stderr, "WARNING: illegal data in line %d pos "	\
> +		        "%d state %d (near \"%.*s\") at %s:%d\n",	\
> +		        lineno, i, state, 20 > avail ? avail : 20, l+i,	\
> +		        __FILE__, __LINE__);				\
> +	}								\
> +	goto skip;							\
> +} while (0)

   Extra line of space here

> +static struct scrub_file_record **scrub_read_file(int fd, int report_errors)
> +{
> +	int avail = 0;
> +	int old_avail = 0;
> +	char l[512];
> +	int state = 0;
> +	int curr = -1;
> +	int i = 0;
> +	int j;
> +	int ret;
> +	int eof = 0;
> +	int lineno = 0;
> +	u64 version;
> +	char empty_uuid[BTRFS_FSID_SIZE] = {0};
> +	struct scrub_file_record **p = NULL;
> +
> +	if (fd < 0)
> +		return ERR_PTR(-EINVAL);
> +
> +again:
> +	old_avail = avail-i;
> +	BUG_ON(old_avail < 0);
> +	if (old_avail)
> +		memmove(l, l+i, old_avail);
> +	avail = read(fd, l+old_avail, sizeof(l)-old_avail);
> +	if (avail == 0) {
> +		eof = 1;
> +	}
> +	if (avail + old_avail == 0) {
> +		if (curr >= 0 &&
> +		    memcmp(p[curr]->fsid, empty_uuid, BTRFS_FSID_SIZE) == 0) {
> +			p[curr] = NULL;
> +		} else if (curr == -1) {
> +			p = ERR_PTR(-ENODATA);
> +		}
> +		return p;
> +	}
> +	if (avail == -1)
> +		return ERR_PTR(-errno);

   If avail == -1 (i.e. the read failed) and old_avail == 1
(i.e. there was only one character left in the buffer), then that
triggers the code in the previous if statement (avail+old_avail == 0)
as well.

> +	avail += old_avail;
> +
> +	i = 0;
> +	while (i < avail) {
> +		switch (state) {
> +		case 0: /* start if file */
                         of

> +			ret = _scrub_kvread(&i,
> +				sizeof(SCRUB_FILE_VERSION_PREFIX)-1, avail, l,

   [1] Drop the colon from SCRUB_FILE_VERSION_PREFIX and leave out the
-1 here.

> +				SCRUB_FILE_VERSION_PREFIX, &version);
> +			if (ret != 1)
> +				_SCRUB_ILLEGAL;
> +			if (version != atoll(SCRUB_FILE_VERSION))
> +				return ERR_PTR(-ENOTSUP);
> +			state = 6;
> +			continue;
> +		case 1: /* start of line, alloc */
> +			if (!eof && !memchr(l+i, '\n', avail-i))
> +				goto again;

   This will cause problems (an infinite loop), I think, if there's an
input line greater than 512 bytes in length -- you can't find a line
ending, so you go back to "again", read in zero bytes because you've
not consumed anything in your buffer, and come back here to discover
that there's no line ending...

> +			++lineno;
> +			if (curr > -1 && memcmp(p[curr]->fsid, empty_uuid,
> +			                        BTRFS_FSID_SIZE) == 0) {
> +				state = 2;
> +				continue;
> +			}
> +			++curr;
> +			p = realloc(p, (curr+2)*sizeof(*p));
> +			if (p)
> +				p[curr] = malloc(sizeof(**p));
> +			if (!p || !p[curr])
> +				return ERR_PTR(-errno);
> +			memset(p[curr], 0, sizeof(**p));
> +			p[curr+1] = NULL;
> +			++state;

   You probably need a comment here (and below, as appropriate) to
indicate that the lack of a continue or break is intended, and not a
bug.

> +		case 2: /* start of line, skip space */
> +			while (isspace(l[i]) && i<avail) {
> +				if (l[i] == '\n')
> +					++lineno;
> +				++i;
> +			}
> +			if (i >= avail || (!eof && !memchr(l+i, '\n', avail-i)))
> +				goto again;
> +			++state;
> +		case 3: /* read fsid */
> +			if (i == avail)
> +				continue;
> +			for (j=0; l[i+j] != ':' && i+j < avail; ++j)
> +				;
> +			if (i+j+1 >= avail)
> +				_SCRUB_ILLEGAL;

   Possibly a comment needed here to indicate that state 1 guarantees
a full line of text in the buffer, so hitting the end of the buffer
here is a fatal error. (It took me a while to work out why this case
was always an error).

> +			if (j != 36)
> +				_SCRUB_ILLEGAL;
> +			l[i+j] = '\0';
> +			ret = uuid_parse(l+i, p[curr]->fsid);
> +			if (ret)
> +				_SCRUB_ILLEGAL;
> +			i += j + 1;
> +			++state;
> +		case 4: /* read dev id */
> +			for (j=0; isdigit(l[i+j]) && i+j < avail; ++j)
> +				;
> +			if (!j || i+j+1 >= avail)

   j == 0 is clearer than !j here, IMO

> +				_SCRUB_ILLEGAL;
> +			p[curr]->devid = atoll(&l[i]);
> +			i += j + 1;

   Is there any reason that you couldn't just use strtoull here? We
know that the string is terminated with a \n (by the guarantee of
state 1), so strtoull will always finish within the buffer.

> +			++state;
> +		case 5: /* read key/value pair */
> +			ret = _SCRUB_KVREAD(&i, data_extents_scrubbed, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, data_extents_scrubbed, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, tree_extents_scrubbed, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, data_bytes_scrubbed, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, tree_bytes_scrubbed, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, read_errors, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, csum_errors, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, verify_errors, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, no_csum, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, csum_discards, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, super_errors, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, malloc_errors, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, uncorrectable_errors, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, corrected_errors, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, last_physical, avail,
> +			                    l, &p[curr]->p) ||
> +			      _SCRUB_KVREAD(&i, finished, avail,
> +			                    l, &p[curr]->stats) ||
> +			      _SCRUB_KVREAD(&i, t_start, avail,
> +			                    l, (u64*)&p[curr]->stats) ||
> +			      _SCRUB_KVREAD(&i, t_resumed, avail,
> +			                    l, (u64*)&p[curr]->stats) ||
> +			      _SCRUB_KVREAD(&i, duration, avail,
> +			                    l, (u64*)&p[curr]->stats) ||
> +			      _SCRUB_KVREAD(&i, canceled, avail,
> +			                    l, &p[curr]->stats);
> +			if (ret != 1)
> +				_SCRUB_ILLEGAL;

   If there's a syntax error in the parser (i.e. the matched key is
not followed by a colon, or we run out of data), then _SCRUB_KVREAD
returns -1, which is converted to 1 by the ||, and the error is
dropped silently.

> +			++state;
> +		case 6: /* after number */
> +			if (l[i] == '|') {
> +				state = 5;
> +			} else if (l[i] == '\n') {
> +				state = 1;
> +			} else {
> +				_SCRUB_ILLEGAL;
> +			}
> +			++i;
> +			continue;
> +		case 99: /* skip rest of line */
> +skip:
> +			state = 99;
> +			do {
> +				++i;
> +				if (l[i-1] == '\n') {
> +					state = 1;
> +					break;
> +				}
> +			} while (i < avail);
> +			continue;
> +		}
> +		BUG();
> +	}
> +	goto again;
> +}
> +#undef _SCRUB_ILLEGAL

[...]

> +static int scrub_write_file(int fd, const char *fsid,
> +                            struct scrub_progress* data, int n)
> +{
> +	int ret = 0;
> +	int i;
> +	char buf[1024];
> +	struct scrub_progress local;
> +	struct scrub_progress *use;
> +
> +	if (n < 1) {
> +		return -EINVAL;
> +	}
> +
> +	ret = _scrub_write_buf(fd, SCRUB_FILE_VERSION_PREFIX SCRUB_FILE_VERSION
> +	                       "\n", sizeof(SCRUB_FILE_VERSION_PREFIX)-1

   If you leave out the colon from SCRUB_FILE_VERSION_PREFIX, then you
don't need the -1 here. You can add a literal colon in the middle of
the string concatenation between the prefix and the version number.

> +	                       + sizeof(SCRUB_FILE_VERSION)-1 + 1);
> +	if (ret)
> +		return -EOVERFLOW;
> +

[...]

> +static struct scrub_file_record *last_dev_scrub(
> +		struct scrub_file_record *const *const past_scrubs, u64 devid)
> +{
> +	int i;
> +
> +	if (!past_scrubs || IS_ERR(past_scrubs))
> +		return NULL;
> +
> +	for (i=0; past_scrubs[i]; ++i)
> +		if (past_scrubs[i]->devid == devid)
> +			return past_scrubs[i];
> +
> +	return NULL;
> +}
> +
> +static int scrub_device_info(int fd, u64 devid,
> +			     struct btrfs_ioctl_dev_info_args *di_args)
> +{
> +	int ret;
> +
> +	di_args->devid = devid;
> +	memset(&di_args->uuid, '\0', sizeof(di_args->uuid));
> +
> +	ret = ioctl(fd, BTRFS_IOC_DEV_INFO, di_args);
> +	return ret ? -errno : 0;
> +}
> +
> +static int scrub_fs_info(int fd, char *path,
> +                         struct btrfs_ioctl_fs_info_args *fi_args,
> +                         struct btrfs_ioctl_dev_info_args **di_ret)
> +{
> +	int ret = 0;
> +	int ndevs = 0;
> +	int i = 1;
> +	struct btrfs_fs_devices* fs_devices_mnt = NULL;
> +	struct btrfs_ioctl_dev_info_args *di_args;
> +	char mp[BTRFS_PATH_NAME_MAX+1];
> +
> +	memset(fi_args, 0, sizeof(*fi_args));
> +
> +	ret = ioctl(fd, BTRFS_IOC_FS_INFO, fi_args);
> +	if (ret && errno == EINVAL) {
> +		/* path is no mounted btrfs. try if it's a device */
> +		ret = check_mounted_where(fd, path, mp, sizeof(mp),
> +		                          &fs_devices_mnt);
> +		if (!ret)
> +			return -EINVAL;
> +		fi_args->num_devices = 1;

   Is this a valid assumption? What happens if I pass just one device
of a multi-device FS to "btrfs scrub start"?

> +		fi_args->max_id = fs_devices_mnt->latest_devid;
> +		i = fs_devices_mnt->latest_devid;
> +		memcpy(fi_args->fsid, fs_devices_mnt->fsid, BTRFS_FSID_SIZE);
> +		close(fd);
> +		fd = open_file_or_dir(mp);
> +		if (fd < 0)
> +			return -errno;
> +	} else if (ret) {
> +		return -errno;
> +	}
> +
> +	if (!fi_args->num_devices)
> +		return 0;
> +
> +	di_args = *di_ret = malloc(fi_args->num_devices*sizeof(*di_args));
> +	if (!di_args)
> +		return -errno;
> +
> +	for (; i<=fi_args->max_id; ++i) {
> +		BUG_ON(ndevs >= fi_args->num_devices);
> +		ret = scrub_device_info(fd, i, &di_args[ndevs]);
> +		if (ret == -ENODEV)
> +			continue;
> +		if (ret)
> +			return ret;
> +		++ndevs;
> +	}
> +
> +	BUG_ON(ndevs == 0);
> +
> +	return 0;
> +}
> +
> +int mkdir_p(char *path)
> +{
> +	int i;
> +	int ret;
> +
> +	for (i=1; i<strlen(path); ++i) {
> +		if (path[i] != '/')
> +			continue;
> +		path[i] = '\0';
> +		ret = mkdir(path, 0777);
> +		if (ret && errno != EEXIST)
> +			return 1;
> +		path[i] = '/';
> +	}
> +
> +	return 0;
> +}
> +
> +static int scrub_start(int argc, char **argv, int resume)
> +{
> +	int fdmnt;
> +	int prg_fd = -1;
> +	int fdres = -1;
> +	int ret;
> +	pid_t pid;
> +	int c;
> +	int i;
> +	int err = 0;

   This clashes with the macro err(). OK, I know the compiler's clever
enough to disambiguate, but it leads to nastiness like [2], below.

> +	int print_raw = 0;
> +	char *path;
> +	int do_background = 1;
> +	int do_wait = 0;
> +	int do_print = 0;
> +	int do_quiet = 0;
> +	int do_record = 1;
> +	int readonly = 0;
> +	int do_stats_per_dev = 0;
> +	int n_start = 0;
> +	int n_skip = 0;
> +	int n_resume = 0;
> +	struct btrfs_ioctl_fs_info_args fi_args;
> +	struct btrfs_ioctl_dev_info_args *di_args = NULL;
> +	struct scrub_progress *sp = NULL;
> +	struct scrub_fs_stat fs_stat;
> +	struct timeval tv;
> +	struct sockaddr_un addr = {
> +		.sun_family = AF_UNIX,
> +	};
> +	pthread_t *t_devs = NULL;
> +	pthread_t t_prog;
> +	pthread_attr_t t_attr;
> +	struct scrub_file_record **past_scrubs = NULL;
> +	struct scrub_file_record *last_scrub = NULL;
> +	char *datafile = strdup(SCRUB_DATA_FILE);

   This is never freed.

> +	char fsid[37];

   Magic number. is there a #define in libuuid for this value?

> +	char sock_path[BTRFS_PATH_NAME_MAX+1] = "";
> +	struct scrub_progress_cycle spc;
> +	pthread_mutex_t spc_write_mutex = PTHREAD_MUTEX_INITIALIZER;
> +	void *terr;
> +	u64 devid;
> +
> +	optind = 1;
> +	while ((c = getopt(argc, argv, "BdqrR")) != -1) {
> +		switch(c) {
> +		case 'B':
> +			do_background = 0;
> +			do_wait = 1;
> +			do_print = 1;
> +			break;
> +		case 'd':
> +			do_stats_per_dev = 1;
> +			break;
> +		case 'q':
> +			do_quiet = 1;
> +			break;
> +		case 'r':
> +			readonly = 1;
> +			break;
> +		case 'R':
> +			print_raw = 1;
> +			break;
> +		case '?':
> +		default:
> +			fprintf(stderr, "ERROR: scrub args invalid.\n"
> +			                " -B  do not background (implies -W)\n"

   What's -W?

> +			                " -d  stats per device (-B only)\n"
> +			                " -q  quiet\n"
> +			                " -r  read only mode\n");
> +			return 1;
> +		}
> +	}
> +
> +	/* try to catch most error cases before forking */
> +
> +	spc.progress = NULL;
> +	if (do_quiet && do_print)
> +		do_print = 0;
> +
> +	if (mkdir_p(datafile)) {
> +		err(!do_quiet, "WARNING: cannot create scrub data "
> +			       "file, mkdir %s failed: %s. Status recording "
> +			       "disabled\n", datafile, strerror(errno));
> +		do_record = 0;
> +	}
> +
> +	path = argv[optind];

   No bounds check:

hrm@ruthven:btrfs-progs-unstable $ ./btrfs scrub start -B
ERROR: can't access '(null)'

> +	fdmnt = open_file_or_dir(path);
> +	if (fdmnt < 0) {
> +		err(!do_quiet, "ERROR: can't access '%s'\n", path);
> +		return 12;
> +	}
> +
> +	ret = scrub_fs_info(fdmnt, path, &fi_args, &di_args);
> +	if (ret) {
> +		err(!do_quiet, "ERROR: getting dev info for scrub failed: "
> +		    "%s\n", strerror(-ret));
> +		err = 1;
> +		goto out;
> +	}
> +	if (!fi_args.num_devices) {
> +		err(!do_quiet, "ERROR: no devices found\n");
> +		err = 1;
> +		goto out;
> +	}
> +
> +	uuid_unparse(fi_args.fsid, fsid);
> +	fdres = scrub_open_file_r(SCRUB_DATA_FILE, fsid);
> +	if (fdres < 0 && fdres != -ENOENT) {
> +		err(!do_quiet, "WARNING: failed to open status file: "
> +		    "%s\n", strerror(-fdres));
> +	} else if (fdres >= 0) {
> +		past_scrubs = scrub_read_file(fdres, !do_quiet);
> +		if (IS_ERR(past_scrubs))
> +			err(!do_quiet, "WARNING: failed to read status file: "
> +			    "%s\n", strerror(-PTR_ERR(past_scrubs)));
> +		close(fdres);
> +	}
> +
> +	t_devs = malloc(fi_args.num_devices*sizeof(*t_devs));
> +	sp = calloc(1, fi_args.num_devices*sizeof(*sp));

   Shouldn't that be calloc(fi_args.num_devices, sizeof(*sp)) ? (OK,
it doesn't make any particular difference, but it just seems odd to
keep a dog and bark yourself).

> +	spc.progress = calloc(1, fi_args.num_devices*2*sizeof(*spc.progress));

   Woof! (And why do we need twice as many progress markers as devices?)

> +	if (!t_devs || !sp || !spc.progress) {
> +		err(!do_quiet, "ERROR: scrub failed: %s", strerror(errno));
> +		err = 1;

   [2] Eugh. Calling what looks like a function, then assigning a
value to it. Can you call the variable something else? (Or make the
macro a more obvious macro: ERR() say?)

> +		goto out;
> +	}
> +
> +	ret = pthread_attr_init(&t_attr);
> +	if (ret) {
> +		err(!do_quiet, "ERROR: pthread_attr_init failed: %s\n",
> +		    strerror(ret));
> +		err = 1;
> +		goto out;
> +	}
> +
> +	for (i = 0; i < fi_args.num_devices; ++i) {
> +		devid = di_args[i].devid;
> +		ret = pthread_mutex_init(&sp[i].progress_mutex, NULL);
> +		if (ret) {
> +			err(!do_quiet, "ERROR: pthread_mutex_init failed: "
> +			    "%s\n", strerror(ret));
> +			err = 1;
> +			goto out;
> +		}
> +		last_scrub = last_dev_scrub(past_scrubs, devid);
> +		sp[i].scrub_args.devid = devid;
> +		sp[i].fd = fdmnt;
> +		if (resume && last_scrub && (last_scrub->stats.canceled ||
> +		                             !last_scrub->stats.finished)) {
> +			++n_resume;
> +			sp[i].scrub_args.start = last_scrub->p.last_physical;
> +			sp[i].resumed = last_scrub;
> +		} else if (resume) {
> +			++n_skip;
> +			sp[i].skip = 1;
> +			sp[i].resumed = last_scrub;
> +			continue;
> +		} else {
> +			++n_start;
> +			sp[i].scrub_args.start = 0ll;
> +			sp[i].resumed = NULL;
> +		}
> +		sp[i].skip = 0;
> +		sp[i].scrub_args.end = (u64)-1ll;
> +		sp[i].scrub_args.flags = readonly ? BTRFS_SCRUB_READONLY : 0;
> +	}
> +
> +	if (!n_start && !n_resume) {
> +		if (!do_quiet)
> +			printf("scrub: nothing to resume for %s, fsid %s\n",
> +			       path, fsid);
> +		err = 0;
> +		goto out;
> +	}
> +
> +	ret = prg_fd = socket(AF_UNIX, SOCK_STREAM, 0);
> +	while (ret != -1) {
> +		_scrub_datafile(SCRUB_PROGRESS_SOCKET_PATH, fsid,
> +				NULL, sock_path, sizeof(sock_path));
> +		/* ignore EOVERFLOW, as strncpy follows anyway */

   The name in sock_path could still be truncated on -EOVERFLOW,
though. Is that always safe?

> +		strncpy(addr.sun_path, sock_path,
> +			sizeof(addr.sun_path)-1);
> +		ret = bind(prg_fd, (struct sockaddr *)&addr, sizeof(addr));
> +		if (ret != -1 || errno != EADDRINUSE)
> +			break;

   If we failed to bind because the address was in use, is there much
point in trying to connect to the socket here?

> +		ret = connect(prg_fd, (struct sockaddr *)&addr, sizeof(addr));
> +		if (!ret || errno != ECONNREFUSED) {
> +			fprintf(stderr, "ERROR: scrub already running\n");
> +			close(prg_fd);
> +			goto out;
> +		}
> +		ret = unlink(sock_path);

   Under the right (wrong) set of circumstances, isn't this loop going
to busy-wait?

> +	}
> +	if (ret != -1) {
> +		ret = listen(prg_fd, 100);
> +	}
> +	if (ret == -1) {
> +		err(!do_quiet, "WARNING: failed to open the progress status "
> +		    "socket at %s: %s. Progress cannot be queried\n",
> +		    sock_path[0] ? sock_path : SCRUB_PROGRESS_SOCKET_PATH,
> +		    strerror(errno));
> +		if (prg_fd != -1) {
> +			close(prg_fd);
> +			prg_fd = -1;
> +			if (sock_path[0])
> +				unlink(sock_path);
> +		}
> +	}
> +
> +	if (do_record) {
> +		/* write all-zero progress file for a start */
> +		ret = scrub_write_progress(&spc_write_mutex, fsid, sp,
> +					   fi_args.num_devices);

   -HRM: Unchecked scrub_write_progress

> +		if (ret) {
> +			err(!do_quiet, "WARNING: failed to write the progress "
> +			    "status file: %s. Status recording disabled\n",
> +			    strerror(-ret));
> +			do_record = 0;
> +		}
> +	}
> +
> +	if (do_background) {
> +		pid = fork();
> +		if (pid == -1) {
> +			err(!do_quiet, "ERROR: cannot scrub, fork failed: "
> +			               "%s\n", strerror(errno));
> +			err = 1;
> +			goto out;
> +		}
> +
> +		if (pid) {
> +			int stat;
> +			scrub_handle_sigint_parent();
> +			if (!do_quiet)
> +				printf("scrub %s on %s, fsid %s (pid=%d)\n",
> +				       n_start ? "started" : "resumed",
> +				       path, fsid, pid);
> +			if (!do_wait) {
> +				err = 0;
> +				goto out;
> +			}
> +			ret = wait(&stat);
> +			if (ret != pid) {
> +				err(!do_quiet, "ERROR: wait failed: (ret=%d) "
> +				    "%s\n", ret, strerror(errno));
> +				err = 1;
> +				goto out;
> +			}
> +			if (!WIFEXITED(stat) || WEXITSTATUS(stat)) {
> +				err(!do_quiet, "ERROR: scrub process failed\n");
> +				err = WIFEXITED(stat) ? WEXITSTATUS(stat) : -1;
> +				goto out;
> +			}
> +			err = 0;
> +			goto out;
> +		}
> +	}
> +
> +	scrub_handle_sigint_child(fdmnt);
> +
> +	for (i = 0; i < fi_args.num_devices; ++i) {
> +		if (sp[i].skip) {
> +			sp[i].scrub_args.progress = sp[i].resumed->p;
> +			sp[i].stats = sp[i].resumed->stats;
> +			sp[i].ret = 0;
> +			sp[i].stats.finished = 1;
> +			continue;
> +		}
> +		devid = di_args[i].devid;
> +		gettimeofday(&tv, NULL);
> +		sp[i].stats.t_start = tv.tv_sec;
> +		ret = pthread_create(&t_devs[i], &t_attr, scrub_one_dev,&sp[i]);

   -HRM not checked scrub_one_dev()

> +		if (ret) {
> +			if (do_print)
> +				fprintf(stderr, "ERROR: creating "
> +				        "scrub_one_dev[%llu] thread failed: "
> +				        "%s\n", devid, strerror(ret));
> +			err = 1;
> +			goto out;
> +		}
> +	}
> +
> +	spc.fdmnt = fdmnt;
> +	spc.prg_fd = prg_fd;
> +	spc.do_record = do_record;
> +	spc.write_mutex = &spc_write_mutex;
> +	spc.shared_progress = sp;
> +	spc.fi = &fi_args;
> +	pthread_create(&t_prog, &t_attr, scrub_progress_cycle, &spc);
> +

   -HRM: Not checked: scrub_progress_cycle()

> +	err = 0;
> +	for (i = 0; i < fi_args.num_devices; ++i) {
> +		if (sp[i].skip)
> +			continue;
> +		devid = di_args[i].devid;
> +		ret = pthread_join(t_devs[i], NULL);
> +		if (ret) {
> +			if (do_print)
> +				fprintf(stderr, "ERROR: pthread_join failed "
> +				        "for scrub_one_dev[%llu]: %s\n", devid,
> +			                strerror(ret));
> +			err++;
> +			continue;
> +		}
> +		if (sp[i].ret && sp[i].ioctl_errno == ENODEV) {
> +			if (do_print)
> +				fprintf(stderr, "WARNING: device %lld not "
> +				        "present\n", devid);
> +			continue;
> +		}
> +		if (sp[i].ret && sp[i].ioctl_errno == ECANCELED) {
> +			err++;
> +		} else if (sp[i].ret) {
> +			if (do_print)
> +				fprintf(stderr, "ERROR: scrubbing %s failed "
> +				        "for device id %lld (%s)\n", path,
> +				        devid, strerror(sp[i].ioctl_errno));
> +			err++;
> +			continue;
> +		}
> +	}
> +
> +	if (do_print) {
> +		const char *append = "done";
> +		if (!do_stats_per_dev)
> +			init_fs_stat(&fs_stat);
> +		for (i = 0; i < fi_args.num_devices; ++i) {
> +			if (do_stats_per_dev) {
> +				print_scrub_dev(&di_args[i],
> +				                &sp[i].scrub_args.progress,
> +				                print_raw,
> +				                sp[i].ret ? "canceled" : "done",
> +				                &sp[i].stats);
> +			} else {
> +				if (sp[i].ret)
> +					append = "canceled";
> +				add_to_fs_stat(&sp[i].scrub_args.progress,
> +						&sp[i].stats, &fs_stat);
> +			}
> +		}
> +		if (!do_stats_per_dev) {
> +			printf("scrub %s for %s\n", append, fsid);
> +			print_fs_stat(&fs_stat, print_raw);
> +		}
> +	}
> +
> +	pthread_cancel(t_prog);
> +	ret = pthread_join(t_prog, &terr);

   Does this need to happen before the output above? Is there a
possible race between scrub_progress_cycle() and the stats gathering
code here? (I've not looked at scrub_progress_cycle() in detail yet,
so I don't know).

> +	if (do_print && terr && terr != PTHREAD_CANCELED) {
> +		fprintf(stderr, "ERROR: recording progress "
> +			"failed: %s\n", strerror(-PTR_ERR(terr)));
> +	}
> +
> +	if (do_record) {
> +		ret = scrub_write_progress(&spc_write_mutex, fsid, sp,
> +					   fi_args.num_devices);

   -HRM Not checked scrub_write_progress()

> +		if (ret && do_print) {
> +			fprintf(stderr, "ERROR: failed to record the result: "
> +				"%s\n", strerror(-ret));
> +		}
> +	}
> +
> +	scrub_handle_sigint_child(-1);
> +
> +out:
> +	free_history(past_scrubs);
> +	free(di_args);
> +	free(t_devs);
> +	free(sp);
> +	free(spc.progress);
> +	if (prg_fd > -1) {
> +		close(prg_fd);
> +		if (sock_path[0])
> +			unlink(sock_path);
> +	}
> +	close(fdmnt);
> +
> +	return !!err;
> +}
> +
> +int do_scrub_start(int argc, char **argv)
> +{
> +	return scrub_start(argc, argv, 0);
> +}
> +
> +int do_scrub_resume(int argc, char **argv)
> +{
> +	return scrub_start(argc, argv, 1);
> +}
[...]
Jan Schmidt July 11, 2011, 2:29 p.m. UTC | #2
On 10.07.2011 20:23, Hugo Mills wrote:
>    Yes, this is over three months after the initial posting, but since
> nobody else has looked at it yet, and the patch is in my integration
> stack...

... thanks!

>    I've not reviewed the whole thing -- just the "scrub start" code so
> far. I've removed the bits I've not checked from the file below.

I rebased the old branch I found to your current integration branch and
fixed up a most of what you mentioned. I'll not send a new version out
until after your complete review (or your statement that this is it or
your statement that you would rather going on reviewing the revised
version).

Things I ripped out are accepted and corrected without resistance.
Comments follow.

> On Wed, Mar 30, 2011 at 06:53:12PM +0200, Jan Schmidt wrote:
> 
>    No commit message at all?

Didn't know what to put there. Cover letter says it all. And as
mentioned, this is the initial implementation.

>> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
>> ---
>>  scrub.c | 1568 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 files changed, 1568 insertions(+), 0 deletions(-)
> 
>    This is quite big to review in one lump... Is it possible to split
> the patch into functional sections? (Add shared infrastructure, then
> each of the four functions separately, maybe?)

Thought about that, but it doesn't make sense to me. It is the initial
implementation. A lot of the code is shared, thus adding one lump and
patching the patch with four small additional commits wouldn't help much.

>> diff --git a/scrub.c b/scrub.c
>> new file mode 100644
>> index 0000000..22052ed
>> --- /dev/null
>> +++ b/scrub.c
>> +#define SCRUB_DATA_FILE "/var/btrfs/scrub.status"
>> +#define SCRUB_PROGRESS_SOCKET_PATH "/var/btrfs/scrub.progress"
> 
>    I'd suggest /var/lib/btrfs/[...] instead. Putting it in the top
> level of /var seems a bit presumptuous (and contravenes the FHS).

I wasn't sure if I can expect /var/lib to be present anywhere btrfs
could run. But I changed it to what you suggested.

>> +	printf("\ttotal bytes scrubbed: %s with %llu errors\n",
>> +		pretty_sizes(p->data_bytes_scrubbed + p->tree_bytes_scrubbed),
>> +		max(err_cnt, err_cnt2));
> 
>    Memory leak: pretty_sizes() mallocs space for its result.

Pah... In a user space function of a run-once utility right before it
exits. But I fixed that one, just to please you :-)

>> +static void init_fs_stat(struct scrub_fs_stat *fs_stat)
>> +{
>> +	memset(fs_stat, 0, sizeof(*fs_stat));
>> +	fs_stat->s.finished = 2;
> 
>    What does 2 mean? ->s.finished seems to be a boolean everywhere
> except here. Can you turn this value into a more descriptive #define?
> Or just use 1?

Good question. I guess I once wanted to distinguish really finished
scrub runs from not-even-started ones. I changed it to 1 (which makes it
much more likely we'll need that distinction quite soon).

>> +static int cancel_fd = -1;
>> +static void scrub_sigint_record_progress(int signal)
> 
>    What does this function have to do with recording progress? Seems a
> bit of a misnomer to me. (Call it scrub_sigint_cancel_scrub, maybe?)

I added a comment and left the name unchanged.

>> +{
>> +	ioctl(cancel_fd, BTRFS_IOC_SCRUB_CANCEL, NULL);
>> +}
>> +
>> +static int scrub_handle_sigint_parent(void)
>> +{
>> +	struct sigaction sa = {
>> +		.sa_handler = SIG_IGN,
>> +		.sa_flags = SA_RESTART,
>> +	};
>> +
>> +	return sigaction(SIGINT, &sa, NULL);
>> +}
>> +
>> +static int scrub_handle_sigint_child(int fd)
>> +{
>> +	struct sigaction sa = {
>> +		.sa_handler = fd == -1 ? SIG_DFL : scrub_sigint_record_progress,
>> +	};
>> +
>> +	cancel_fd = fd;
>> +	return sigaction(SIGINT, &sa, NULL);
>> +}
>> +
>> +static int _scrub_datafile(const char *fn_base, const char *fn_local,
>> +                           const char *fn_tmp, char *datafile, int max)
>> +{
>> +	int ret;
>> +
>> +	strncpy(datafile, fn_base, max);
> 
>    You need to put a zero byte at datafile[max], otherwise it could be
> unterminated (if max <= strlen(fn_base)), and the strlen will then run
> off the end of the string.

Damn. strncpy is a mess. I want strlcpy.

I Modified the code another way. I rather return an error than throwing
away bytes and continue happily.

strncpy third arg always - 1, thus we always have a 0 byte at the end of
the buffer. I then compare strlen to the buffer size.

>> +	ret = strlen(datafile);
>> +	
>> +	if (ret + 1 >= max)
>> +		return -EOVERFLOW;
> 
>    This will never happen (if you put the zero terminator in)
> 
>> +	datafile[ret] = '.';
>> +	strncpy(datafile+ret+1, fn_local, max-ret-1);
> 
>    ... and add a zero byte here, too (or use strncat)
> 
>> +	ret = strlen(datafile);
>> +
>> +	if (ret + 1 >= max)
>> +		return -EOVERFLOW;
> 
>    as above: won't happen
> 
>> +	if (fn_tmp) {
>> +		datafile[ret] = '_';
>> +		strncpy(datafile+ret+1, fn_tmp, max-ret-1);
> 
>    ... and add a zero byte here (or use strncat)
> 
>> +		ret = strlen(datafile);
>> +
>> +		if (ret >= max)
>> +			return -EOVERFLOW;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int _scrub_open_file(const char *datafile, int m)
> 
>    Just a niggle: Why the leading _ when other scrub-specific
> functions don't have it? (There's about a dozen other such symbols --
> was there a criterion you used to decide which ones have _ and which
> don't?)

Looking at the function names today, I think the criterion was
"everything i/o based get's a _", which is rubbish. I dropped all the '_'s.

>> +static struct scrub_file_record **scrub_read_file(int fd, int report_errors)
>> +{
>> +	int avail = 0;
>> +	int old_avail = 0;
>> +	char l[512];
>> +	int state = 0;
>> +	int curr = -1;
>> +	int i = 0;
>> +	int j;
>> +	int ret;
>> +	int eof = 0;
>> +	int lineno = 0;
>> +	u64 version;
>> +	char empty_uuid[BTRFS_FSID_SIZE] = {0};
>> +	struct scrub_file_record **p = NULL;
>> +
>> +	if (fd < 0)
>> +		return ERR_PTR(-EINVAL);
>> +
>> +again:
>> +	old_avail = avail-i;
>> +	BUG_ON(old_avail < 0);
>> +	if (old_avail)
>> +		memmove(l, l+i, old_avail);
>> +	avail = read(fd, l+old_avail, sizeof(l)-old_avail);
>> +	if (avail == 0) {
>> +		eof = 1;
>> +	}
>> +	if (avail + old_avail == 0) {
>> +		if (curr >= 0 &&
>> +		    memcmp(p[curr]->fsid, empty_uuid, BTRFS_FSID_SIZE) == 0) {
>> +			p[curr] = NULL;
>> +		} else if (curr == -1) {
>> +			p = ERR_PTR(-ENODATA);
>> +		}
>> +		return p;
>> +	}
>> +	if (avail == -1)
>> +		return ERR_PTR(-errno);
> 
>    If avail == -1 (i.e. the read failed) and old_avail == 1
> (i.e. there was only one character left in the buffer), then that
> triggers the code in the previous if statement (avail+old_avail == 0)
> as well.
> 
>> +	avail += old_avail;
>> +
>> +	i = 0;
>> +	while (i < avail) {
>> +		switch (state) {
>> +		case 0: /* start if file */
>                          of
> 
>> +			ret = _scrub_kvread(&i,
>> +				sizeof(SCRUB_FILE_VERSION_PREFIX)-1, avail, l,
> 
>    [1] Drop the colon from SCRUB_FILE_VERSION_PREFIX and leave out the
> -1 here.
> 
>> +				SCRUB_FILE_VERSION_PREFIX, &version);
>> +			if (ret != 1)
>> +				_SCRUB_ILLEGAL;
>> +			if (version != atoll(SCRUB_FILE_VERSION))
>> +				return ERR_PTR(-ENOTSUP);
>> +			state = 6;
>> +			continue;
>> +		case 1: /* start of line, alloc */
>> +			if (!eof && !memchr(l+i, '\n', avail-i))
>> +				goto again;
> 
>    This will cause problems (an infinite loop), I think, if there's an
> input line greater than 512 bytes in length -- you can't find a line
> ending, so you go back to "again", read in zero bytes because you've
> not consumed anything in your buffer, and come back here to discover
> that there's no line ending...

No, it will complain about a too long line. We go to "again", read 0
characters (which is fine, returns 0), then set the eof flag (which is
kind of a bug) and stop processing. I.e., whenever we encounter a line
longer than sizeof(l), we tell this fact to the user and stop processing
for that file completely.

I increased that buffer to 16k, which sounds big enough for quite some
file format extensions.

>> +			++lineno;
>> +			if (curr > -1 && memcmp(p[curr]->fsid, empty_uuid,
>> +			                        BTRFS_FSID_SIZE) == 0) {
>> +				state = 2;
>> +				continue;
>> +			}
>> +			++curr;
>> +			p = realloc(p, (curr+2)*sizeof(*p));
>> +			if (p)
>> +				p[curr] = malloc(sizeof(**p));
>> +			if (!p || !p[curr])
>> +				return ERR_PTR(-errno);
>> +			memset(p[curr], 0, sizeof(**p));
>> +			p[curr+1] = NULL;
>> +			++state;
> 
>    You probably need a comment here (and below, as appropriate) to
> indicate that the lack of a continue or break is intended, and not a
> bug.
> 
>> +		case 2: /* start of line, skip space */
>> +			while (isspace(l[i]) && i<avail) {
>> +				if (l[i] == '\n')
>> +					++lineno;
>> +				++i;
>> +			}
>> +			if (i >= avail || (!eof && !memchr(l+i, '\n', avail-i)))
>> +				goto again;
>> +			++state;
>> +		case 3: /* read fsid */
>> +			if (i == avail)
>> +				continue;
>> +			for (j=0; l[i+j] != ':' && i+j < avail; ++j)
>> +				;
>> +			if (i+j+1 >= avail)
>> +				_SCRUB_ILLEGAL;
> 
>    Possibly a comment needed here to indicate that state 1 guarantees
> a full line of text in the buffer, so hitting the end of the buffer
> here is a fatal error. (It took me a while to work out why this case
> was always an error).

Comment added in "case 1", instead, because this holds true for each
_SCRUB_INVALID (let's call it invalid) in this block.

>> +			if (j != 36)
>> +				_SCRUB_ILLEGAL;
>> +			l[i+j] = '\0';
>> +			ret = uuid_parse(l+i, p[curr]->fsid);
>> +			if (ret)
>> +				_SCRUB_ILLEGAL;
>> +			i += j + 1;
>> +			++state;
>> +		case 4: /* read dev id */
>> +			for (j=0; isdigit(l[i+j]) && i+j < avail; ++j)
>> +				;
>> +			if (!j || i+j+1 >= avail)
> 
>    j == 0 is clearer than !j here, IMO
> 
>> +				_SCRUB_ILLEGAL;
>> +			p[curr]->devid = atoll(&l[i]);
>> +			i += j + 1;
> 
>    Is there any reason that you couldn't just use strtoull here? We
> know that the string is terminated with a \n (by the guarantee of
> state 1), so strtoull will always finish within the buffer.

I just found it way easier to use atoll. We already know the first
character really is a digit, so why bother with a more cumbersome function?

>> +			++state;
>> +		case 5: /* read key/value pair */
>> +			ret = _SCRUB_KVREAD(&i, data_extents_scrubbed, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, data_extents_scrubbed, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, tree_extents_scrubbed, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, data_bytes_scrubbed, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, tree_bytes_scrubbed, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, read_errors, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, csum_errors, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, verify_errors, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, no_csum, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, csum_discards, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, super_errors, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, malloc_errors, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, uncorrectable_errors, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, corrected_errors, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, last_physical, avail,
>> +			                    l, &p[curr]->p) ||
>> +			      _SCRUB_KVREAD(&i, finished, avail,
>> +			                    l, &p[curr]->stats) ||
>> +			      _SCRUB_KVREAD(&i, t_start, avail,
>> +			                    l, (u64*)&p[curr]->stats) ||
>> +			      _SCRUB_KVREAD(&i, t_resumed, avail,
>> +			                    l, (u64*)&p[curr]->stats) ||
>> +			      _SCRUB_KVREAD(&i, duration, avail,
>> +			                    l, (u64*)&p[curr]->stats) ||
>> +			      _SCRUB_KVREAD(&i, canceled, avail,
>> +			                    l, &p[curr]->stats);
>> +			if (ret != 1)
>> +				_SCRUB_ILLEGAL;
> 
>    If there's a syntax error in the parser (i.e. the matched key is
> not followed by a colon, or we run out of data), then _SCRUB_KVREAD
> returns -1, which is converted to 1 by the ||, and the error is
> dropped silently.

Oops. Fixed by defining 0 to be the success return value for
scrub_kvread (sounds better, anyway). The check then looks for ret != 0
instead.

>> +static int scrub_fs_info(int fd, char *path,
>> +                         struct btrfs_ioctl_fs_info_args *fi_args,
>> +                         struct btrfs_ioctl_dev_info_args **di_ret)
>> +{
>> +	int ret = 0;
>> +	int ndevs = 0;
>> +	int i = 1;
>> +	struct btrfs_fs_devices* fs_devices_mnt = NULL;
>> +	struct btrfs_ioctl_dev_info_args *di_args;
>> +	char mp[BTRFS_PATH_NAME_MAX+1];
>> +
>> +	memset(fi_args, 0, sizeof(*fi_args));
>> +
>> +	ret = ioctl(fd, BTRFS_IOC_FS_INFO, fi_args);
>> +	if (ret && errno == EINVAL) {
>> +		/* path is no mounted btrfs. try if it's a device */
>> +		ret = check_mounted_where(fd, path, mp, sizeof(mp),
>> +		                          &fs_devices_mnt);
>> +		if (!ret)
>> +			return -EINVAL;
>> +		fi_args->num_devices = 1;
> 
>    Is this a valid assumption? What happens if I pass just one device
> of a multi-device FS to "btrfs scrub start"?

You tell scrub to scrub that one device, then. At least for raid1, this
option makes sense.

>> +static int scrub_start(int argc, char **argv, int resume)
>> +{
>> +	int fdmnt;
>> +	int prg_fd = -1;
>> +	int fdres = -1;
>> +	int ret;
>> +	pid_t pid;
>> +	int c;
>> +	int i;
>> +	int err = 0;
> 
>    This clashes with the macro err(). OK, I know the compiler's clever
> enough to disambiguate, but it leads to nastiness like [2], below.

Okok. Macro is now ERR().

>> +	int print_raw = 0;
>> +	char *path;
>> +	int do_background = 1;
>> +	int do_wait = 0;
>> +	int do_print = 0;
>> +	int do_quiet = 0;
>> +	int do_record = 1;
>> +	int readonly = 0;
>> +	int do_stats_per_dev = 0;
>> +	int n_start = 0;
>> +	int n_skip = 0;
>> +	int n_resume = 0;
>> +	struct btrfs_ioctl_fs_info_args fi_args;
>> +	struct btrfs_ioctl_dev_info_args *di_args = NULL;
>> +	struct scrub_progress *sp = NULL;
>> +	struct scrub_fs_stat fs_stat;
>> +	struct timeval tv;
>> +	struct sockaddr_un addr = {
>> +		.sun_family = AF_UNIX,
>> +	};
>> +	pthread_t *t_devs = NULL;
>> +	pthread_t t_prog;
>> +	pthread_attr_t t_attr;
>> +	struct scrub_file_record **past_scrubs = NULL;
>> +	struct scrub_file_record *last_scrub = NULL;
>> +	char *datafile = strdup(SCRUB_DATA_FILE);
> 
>    This is never freed.
> 
>> +	char fsid[37];
> 
>    Magic number. is there a #define in libuuid for this value?

At least the man page of uuid_parse clearly states uuids have 36 bytes
plus a \0 in printf format. uuid/uuid.h doesn't contain such a constant.
But volumes.c, print-tree.c and others do it with 37, too.

>> +	char sock_path[BTRFS_PATH_NAME_MAX+1] = "";
>> +	struct scrub_progress_cycle spc;
>> +	pthread_mutex_t spc_write_mutex = PTHREAD_MUTEX_INITIALIZER;
>> +	void *terr;
>> +	u64 devid;
>> +
>> +	optind = 1;
>> +	while ((c = getopt(argc, argv, "BdqrR")) != -1) {
>> +		switch(c) {
>> +		case 'B':
>> +			do_background = 0;
>> +			do_wait = 1;
>> +			do_print = 1;
>> +			break;
>> +		case 'd':
>> +			do_stats_per_dev = 1;
>> +			break;
>> +		case 'q':
>> +			do_quiet = 1;
>> +			break;
>> +		case 'r':
>> +			readonly = 1;
>> +			break;
>> +		case 'R':
>> +			print_raw = 1;
>> +			break;
>> +		case '?':
>> +		default:
>> +			fprintf(stderr, "ERROR: scrub args invalid.\n"
>> +			                " -B  do not background (implies -W)\n"
> 
>    What's -W?

A development option that was removed before submitting v2 and is
removed for v3 now from that comment above as well.

>> +			                " -d  stats per device (-B only)\n"
>> +			                " -q  quiet\n"
>> +			                " -r  read only mode\n");
>> +			return 1;
>> +		}
>> +	}
>> +
>> +	/* try to catch most error cases before forking */
>> +
>> +	spc.progress = NULL;
>> +	if (do_quiet && do_print)
>> +		do_print = 0;
>> +
>> +	if (mkdir_p(datafile)) {
>> +		err(!do_quiet, "WARNING: cannot create scrub data "
>> +			       "file, mkdir %s failed: %s. Status recording "
>> +			       "disabled\n", datafile, strerror(errno));
>> +		do_record = 0;
>> +	}
>> +
>> +	path = argv[optind];
> 
>    No bounds check:
> 
> hrm@ruthven:btrfs-progs-unstable $ ./btrfs scrub start -B
> ERROR: can't access '(null)'
> 
>> +	fdmnt = open_file_or_dir(path);
>> +	if (fdmnt < 0) {
>> +		err(!do_quiet, "ERROR: can't access '%s'\n", path);
>> +		return 12;
>> +	}
>> +
>> +	ret = scrub_fs_info(fdmnt, path, &fi_args, &di_args);
>> +	if (ret) {
>> +		err(!do_quiet, "ERROR: getting dev info for scrub failed: "
>> +		    "%s\n", strerror(-ret));
>> +		err = 1;
>> +		goto out;
>> +	}
>> +	if (!fi_args.num_devices) {
>> +		err(!do_quiet, "ERROR: no devices found\n");
>> +		err = 1;
>> +		goto out;
>> +	}
>> +
>> +	uuid_unparse(fi_args.fsid, fsid);
>> +	fdres = scrub_open_file_r(SCRUB_DATA_FILE, fsid);
>> +	if (fdres < 0 && fdres != -ENOENT) {
>> +		err(!do_quiet, "WARNING: failed to open status file: "
>> +		    "%s\n", strerror(-fdres));
>> +	} else if (fdres >= 0) {
>> +		past_scrubs = scrub_read_file(fdres, !do_quiet);
>> +		if (IS_ERR(past_scrubs))
>> +			err(!do_quiet, "WARNING: failed to read status file: "
>> +			    "%s\n", strerror(-PTR_ERR(past_scrubs)));
>> +		close(fdres);
>> +	}
>> +
>> +	t_devs = malloc(fi_args.num_devices*sizeof(*t_devs));
>> +	sp = calloc(1, fi_args.num_devices*sizeof(*sp));
> 
>    Shouldn't that be calloc(fi_args.num_devices, sizeof(*sp)) ? (OK,
> it doesn't make any particular difference, but it just seems odd to
> keep a dog and bark yourself).

Sounds quite reasonable.

>> +	spc.progress = calloc(1, fi_args.num_devices*2*sizeof(*spc.progress));
> 
>    Woof! (And why do we need twice as many progress markers as devices?)

We need them for consistency reasons. Both are used alternately when
progress is recorded to make sure we never write a half baked record.

>> +	if (!t_devs || !sp || !spc.progress) {
>> +		err(!do_quiet, "ERROR: scrub failed: %s", strerror(errno));
>> +		err = 1;
> 
>    [2] Eugh. Calling what looks like a function, then assigning a
> value to it. Can you call the variable something else? (Or make the
> macro a more obvious macro: ERR() say?)
> 
>> +		goto out;
>> +	}
>> +
>> +	ret = pthread_attr_init(&t_attr);
>> +	if (ret) {
>> +		err(!do_quiet, "ERROR: pthread_attr_init failed: %s\n",
>> +		    strerror(ret));
>> +		err = 1;
>> +		goto out;
>> +	}
>> +
>> +	for (i = 0; i < fi_args.num_devices; ++i) {
>> +		devid = di_args[i].devid;
>> +		ret = pthread_mutex_init(&sp[i].progress_mutex, NULL);
>> +		if (ret) {
>> +			err(!do_quiet, "ERROR: pthread_mutex_init failed: "
>> +			    "%s\n", strerror(ret));
>> +			err = 1;
>> +			goto out;
>> +		}
>> +		last_scrub = last_dev_scrub(past_scrubs, devid);
>> +		sp[i].scrub_args.devid = devid;
>> +		sp[i].fd = fdmnt;
>> +		if (resume && last_scrub && (last_scrub->stats.canceled ||
>> +		                             !last_scrub->stats.finished)) {
>> +			++n_resume;
>> +			sp[i].scrub_args.start = last_scrub->p.last_physical;
>> +			sp[i].resumed = last_scrub;
>> +		} else if (resume) {
>> +			++n_skip;
>> +			sp[i].skip = 1;
>> +			sp[i].resumed = last_scrub;
>> +			continue;
>> +		} else {
>> +			++n_start;
>> +			sp[i].scrub_args.start = 0ll;
>> +			sp[i].resumed = NULL;
>> +		}
>> +		sp[i].skip = 0;
>> +		sp[i].scrub_args.end = (u64)-1ll;
>> +		sp[i].scrub_args.flags = readonly ? BTRFS_SCRUB_READONLY : 0;
>> +	}
>> +
>> +	if (!n_start && !n_resume) {
>> +		if (!do_quiet)
>> +			printf("scrub: nothing to resume for %s, fsid %s\n",
>> +			       path, fsid);
>> +		err = 0;
>> +		goto out;
>> +	}
>> +
>> +	ret = prg_fd = socket(AF_UNIX, SOCK_STREAM, 0);
>> +	while (ret != -1) {
>> +		_scrub_datafile(SCRUB_PROGRESS_SOCKET_PATH, fsid,
>> +				NULL, sock_path, sizeof(sock_path));
>> +		/* ignore EOVERFLOW, as strncpy follows anyway */
> 
>    The name in sock_path could still be truncated on -EOVERFLOW,
> though. Is that always safe?

Yeah, no, that wasn't nice. It's changed now.

>> +		strncpy(addr.sun_path, sock_path,
>> +			sizeof(addr.sun_path)-1);
>> +		ret = bind(prg_fd, (struct sockaddr *)&addr, sizeof(addr));
>> +		if (ret != -1 || errno != EADDRINUSE)
>> +			break;
> 
>    If we failed to bind because the address was in use, is there much
> point in trying to connect to the socket here?

Had to think about those lines for a moment, so I'll add a comment. It's
code that cares for an orphan socket file: try to connect, if that's
working, scrub must be running, so error out. If it doesn't, unlink the
file and loop.

>> +		ret = connect(prg_fd, (struct sockaddr *)&addr, sizeof(addr));
>> +		if (!ret || errno != ECONNREFUSED) {
>> +			fprintf(stderr, "ERROR: scrub already running\n");
>> +			close(prg_fd);
>> +			goto out;
>> +		}
>> +		ret = unlink(sock_path);
> 
>    Under the right (wrong) set of circumstances, isn't this loop going
> to busy-wait?

In general, the loop can't execute more than twice.

If you manage to recreate that socket file in exactly the same moment
over and over again, that's possible, yes. But you would have to do that
externally on your own, this can't happen if you only use btrfs scrub.

>> +	}
>> +	if (ret != -1) {
>> +		ret = listen(prg_fd, 100);
>> +	}
>> +	if (ret == -1) {
>> +		err(!do_quiet, "WARNING: failed to open the progress status "
>> +		    "socket at %s: %s. Progress cannot be queried\n",
>> +		    sock_path[0] ? sock_path : SCRUB_PROGRESS_SOCKET_PATH,
>> +		    strerror(errno));
>> +		if (prg_fd != -1) {
>> +			close(prg_fd);
>> +			prg_fd = -1;
>> +			if (sock_path[0])
>> +				unlink(sock_path);
>> +		}
>> +	}
>> +
>> +	if (do_record) {
>> +		/* write all-zero progress file for a start */
>> +		ret = scrub_write_progress(&spc_write_mutex, fsid, sp,
>> +					   fi_args.num_devices);
> 
>    -HRM: Unchecked scrub_write_progress
> 
>> +		if (ret) {
>> +			err(!do_quiet, "WARNING: failed to write the progress "
>> +			    "status file: %s. Status recording disabled\n",
>> +			    strerror(-ret));
>> +			do_record = 0;
>> +		}
>> +	}
>> +
>> +	if (do_background) {
>> +		pid = fork();
>> +		if (pid == -1) {
>> +			err(!do_quiet, "ERROR: cannot scrub, fork failed: "
>> +			               "%s\n", strerror(errno));
>> +			err = 1;
>> +			goto out;
>> +		}
>> +
>> +		if (pid) {
>> +			int stat;
>> +			scrub_handle_sigint_parent();
>> +			if (!do_quiet)
>> +				printf("scrub %s on %s, fsid %s (pid=%d)\n",
>> +				       n_start ? "started" : "resumed",
>> +				       path, fsid, pid);
>> +			if (!do_wait) {
>> +				err = 0;
>> +				goto out;
>> +			}
>> +			ret = wait(&stat);
>> +			if (ret != pid) {
>> +				err(!do_quiet, "ERROR: wait failed: (ret=%d) "
>> +				    "%s\n", ret, strerror(errno));
>> +				err = 1;
>> +				goto out;
>> +			}
>> +			if (!WIFEXITED(stat) || WEXITSTATUS(stat)) {
>> +				err(!do_quiet, "ERROR: scrub process failed\n");
>> +				err = WIFEXITED(stat) ? WEXITSTATUS(stat) : -1;
>> +				goto out;
>> +			}
>> +			err = 0;
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	scrub_handle_sigint_child(fdmnt);
>> +
>> +	for (i = 0; i < fi_args.num_devices; ++i) {
>> +		if (sp[i].skip) {
>> +			sp[i].scrub_args.progress = sp[i].resumed->p;
>> +			sp[i].stats = sp[i].resumed->stats;
>> +			sp[i].ret = 0;
>> +			sp[i].stats.finished = 1;
>> +			continue;
>> +		}
>> +		devid = di_args[i].devid;
>> +		gettimeofday(&tv, NULL);
>> +		sp[i].stats.t_start = tv.tv_sec;
>> +		ret = pthread_create(&t_devs[i], &t_attr, scrub_one_dev,&sp[i]);
> 
>    -HRM not checked scrub_one_dev()
> 
>> +		if (ret) {
>> +			if (do_print)
>> +				fprintf(stderr, "ERROR: creating "
>> +				        "scrub_one_dev[%llu] thread failed: "
>> +				        "%s\n", devid, strerror(ret));
>> +			err = 1;
>> +			goto out;
>> +		}
>> +	}
>> +
>> +	spc.fdmnt = fdmnt;
>> +	spc.prg_fd = prg_fd;
>> +	spc.do_record = do_record;
>> +	spc.write_mutex = &spc_write_mutex;
>> +	spc.shared_progress = sp;
>> +	spc.fi = &fi_args;
>> +	pthread_create(&t_prog, &t_attr, scrub_progress_cycle, &spc);
>> +
> 
>    -HRM: Not checked: scrub_progress_cycle()
> 
>> +	err = 0;
>> +	for (i = 0; i < fi_args.num_devices; ++i) {
>> +		if (sp[i].skip)
>> +			continue;
>> +		devid = di_args[i].devid;

HRM: mark1

>> +		ret = pthread_join(t_devs[i], NULL);
>> +		if (ret) {
>> +			if (do_print)
>> +				fprintf(stderr, "ERROR: pthread_join failed "
>> +				        "for scrub_one_dev[%llu]: %s\n", devid,
>> +			                strerror(ret));
>> +			err++;
>> +			continue;
>> +		}
>> +		if (sp[i].ret && sp[i].ioctl_errno == ENODEV) {
>> +			if (do_print)
>> +				fprintf(stderr, "WARNING: device %lld not "
>> +				        "present\n", devid);
>> +			continue;
>> +		}
>> +		if (sp[i].ret && sp[i].ioctl_errno == ECANCELED) {
>> +			err++;
>> +		} else if (sp[i].ret) {
>> +			if (do_print)
>> +				fprintf(stderr, "ERROR: scrubbing %s failed "
>> +				        "for device id %lld (%s)\n", path,
>> +				        devid, strerror(sp[i].ioctl_errno));
>> +			err++;
>> +			continue;
>> +		}
>> +	}
>> +
>> +	if (do_print) {
>> +		const char *append = "done";
>> +		if (!do_stats_per_dev)
>> +			init_fs_stat(&fs_stat);
>> +		for (i = 0; i < fi_args.num_devices; ++i) {
>> +			if (do_stats_per_dev) {
>> +				print_scrub_dev(&di_args[i],
>> +				                &sp[i].scrub_args.progress,
>> +				                print_raw,
>> +				                sp[i].ret ? "canceled" : "done",
>> +				                &sp[i].stats);
>> +			} else {
>> +				if (sp[i].ret)
>> +					append = "canceled";
>> +				add_to_fs_stat(&sp[i].scrub_args.progress,
>> +						&sp[i].stats, &fs_stat);
>> +			}
>> +		}
>> +		if (!do_stats_per_dev) {
>> +			printf("scrub %s for %s\n", append, fsid);
>> +			print_fs_stat(&fs_stat, print_raw);
>> +		}
>> +	}
>> +
>> +	pthread_cancel(t_prog);
>> +	ret = pthread_join(t_prog, &terr);
> 
>    Does this need to happen before the output above? Is there a

No, it doesn't. Look at the line below the "mark1" mark above. (What a
sentence!) There we join all the worker threads and print their results.
After succeeding, all workers must be done, so the progress thread can't
do anything useful anymore.

> possible race between scrub_progress_cycle() and the stats gathering
> code here? (I've not looked at scrub_progress_cycle() in detail yet,
> so I don't know).

As I see it, no. In addition to the fact that there won't come any fresh
progress notification at this state, we have those progress fields twice
to ensure consistency together with the progress_mutex taken by
scrub_progress_cycle.

-Jan
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hugo Mills July 11, 2011, 8:45 p.m. UTC | #3
OK, here's the remainder of my comments for this file. Not much for
this bit -- just one comment about locking, a reminder, and an
observation.

On Wed, Mar 30, 2011 at 06:53:12PM +0200, Jan Schmidt wrote:
[...]

> +static int _scrub_write_buf(int fd, const void *data, int len)
> +{
> +	int ret;
> +	ret = write(fd, data, len);
> +	return ret - len;
> +}
> +
> +static int _scrub_writev(int fd, char *buf, int max, const char *fmt, ...)
> +				__attribute__ ((format (printf, 4, 5)));
> +static int _scrub_writev(int fd, char *buf, int max, const char *fmt, ...)
> +{
> +	int ret;
> +	va_list args;
> +	
> +	va_start(args, fmt);
> +	ret = vsnprintf(buf, max, fmt, args);
> +	va_end(args);
> +	if (ret >= max)
> +		return ret - max;
> +	return _scrub_write_buf(fd, buf, ret);
> +}
> +
> +#define _SCRUB_SUM(dest, data, name) dest->scrub_args.progress.name =	\
> +			data->resumed->p.name + data->scrub_args.progress.name
> +static struct scrub_progress *_scrub_resumed_stats(struct scrub_progress *data,
> +                                                   struct scrub_progress *dest)
> +{
> +	if (!data->resumed || data->skip)
> +		return data;
> +
> +	_SCRUB_SUM(dest, data, data_extents_scrubbed);
> +	_SCRUB_SUM(dest, data, tree_extents_scrubbed);
> +	_SCRUB_SUM(dest, data, data_bytes_scrubbed);
> +	_SCRUB_SUM(dest, data, tree_bytes_scrubbed);
> +	_SCRUB_SUM(dest, data, read_errors);
> +	_SCRUB_SUM(dest, data, csum_errors);
> +	_SCRUB_SUM(dest, data, verify_errors);
> +	_SCRUB_SUM(dest, data, no_csum);
> +	_SCRUB_SUM(dest, data, csum_discards);
> +	_SCRUB_SUM(dest, data, super_errors);
> +	_SCRUB_SUM(dest, data, malloc_errors);
> +	_SCRUB_SUM(dest, data, uncorrectable_errors);
> +	_SCRUB_SUM(dest, data, corrected_errors);
> +	_SCRUB_SUM(dest, data, last_physical);
> +	dest->stats.canceled = data->stats.canceled;
> +	dest->stats.finished = data->stats.finished;
> +	dest->stats.t_resumed = data->stats.t_start;
> +	dest->stats.t_start = data->resumed->stats.t_start;
> +	dest->stats.duration = data->resumed->stats.duration +
> +							data->stats.duration;
> +	dest->scrub_args.devid = data->scrub_args.devid;
> +	return dest;
> +}
> +
> +#define _SCRUB_KVWRITE(fd, buf, name, use) 		\
> +	_scrub_kvwrite(fd, buf, sizeof(buf), #name, 	\
> +	               use->scrub_args.progress.name)
> +#define _SCRUB_KVWRITE_STATS(fd, buf, name, use) 	\
> +	_scrub_kvwrite(fd, buf, sizeof(buf), #name, 	\
> +	               use->stats.name)
> +static int _scrub_kvwrite(int fd, char *buf, int max,
> +                          const char *key, u64 val)
> +{
> +	return _scrub_writev(fd, buf, max, "|%s:%lld", key, val);
> +}
> +
> +static int scrub_write_file(int fd, const char *fsid,
> +                            struct scrub_progress* data, int n)
> +{
> +	int ret = 0;
> +	int i;
> +	char buf[1024];
> +	struct scrub_progress local;
> +	struct scrub_progress *use;
> +
> +	if (n < 1) {
> +		return -EINVAL;
> +	}
> +
> +	ret = _scrub_write_buf(fd, SCRUB_FILE_VERSION_PREFIX SCRUB_FILE_VERSION
> +	                       "\n", sizeof(SCRUB_FILE_VERSION_PREFIX)-1
> +	                       + sizeof(SCRUB_FILE_VERSION)-1 + 1);
> +	if (ret)
> +		return -EOVERFLOW;
> +
> +	for (i=0; i<n; ++i) {
> +		use = _scrub_resumed_stats(&data[i], &local);
> +		if (_scrub_write_buf(fd, fsid, strlen(fsid)) ||
> +		    _scrub_write_buf(fd, ":", 1) ||
> +		    _scrub_writev(fd, buf, sizeof(buf), "%lld",
> +		                  use->scrub_args.devid) ||
> +		    _scrub_write_buf(fd, buf, ret) ||
> +		    _SCRUB_KVWRITE(fd, buf, data_extents_scrubbed, use) ||
> +		    _SCRUB_KVWRITE(fd, buf, tree_extents_scrubbed, use) ||
> +		    _SCRUB_KVWRITE(fd, buf, data_bytes_scrubbed, use) ||
> +		    _SCRUB_KVWRITE(fd, buf, tree_bytes_scrubbed, use) ||
> +		    _SCRUB_KVWRITE(fd, buf, read_errors, use) ||
> +		    _SCRUB_KVWRITE(fd, buf, csum_errors, use) ||
> +		    _SCRUB_KVWRITE(fd, buf, verify_errors, use) ||
> +		    _SCRUB_KVWRITE(fd, buf, no_csum, use) ||
> +		    _SCRUB_KVWRITE(fd, buf, csum_discards, use) ||
> +		    _SCRUB_KVWRITE(fd, buf, super_errors, use) ||
> +		    _SCRUB_KVWRITE(fd, buf, malloc_errors, use) ||
> +		    _SCRUB_KVWRITE(fd, buf, uncorrectable_errors, use) ||
> +		    _SCRUB_KVWRITE(fd, buf, corrected_errors, use) ||
> +		    _SCRUB_KVWRITE(fd, buf, last_physical, use) ||
> +		    _SCRUB_KVWRITE_STATS(fd, buf, t_start, use) ||
> +		    _SCRUB_KVWRITE_STATS(fd, buf, t_resumed, use) ||
> +		    _SCRUB_KVWRITE_STATS(fd, buf, duration, use) ||
> +		    _SCRUB_KVWRITE_STATS(fd, buf, canceled, use) ||
> +		    _SCRUB_KVWRITE_STATS(fd, buf, finished, use) ||
> +		    _scrub_write_buf(fd, "\n", 1)) {
> +			return -EOVERFLOW;
> +		}
> +	}
> +
> +	return 0;
> +}
> +#undef _SCRUB_KVWRITE
> +
> +static int scrub_write_progress(pthread_mutex_t *m, const char *fsid,
> +                                struct scrub_progress* data, int n)
> +{
> +	int ret;
> +	int fd;
> +	int old;
> +
> +	ret = pthread_mutex_lock(m);
> +	if (ret) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &old);

   Probably not massively important, but you don't check for the
return value of this call or its counterpart at the end of this
function.

> +	fd = scrub_open_file_w(SCRUB_DATA_FILE, fsid, "tmp");
> +	if (fd < 0) {
> +		ret = fd;
> +		goto out;
> +	}
> +	ret = scrub_write_file(fd, fsid, data, n);
> +	if (ret)
> +		goto out;

   This leaks the file handle fd.

> +	ret = scrub_rename_file(SCRUB_DATA_FILE, fsid, "tmp");
> +	if (ret)
> +		goto out;

   As does this.

> +	ret = close(fd);
> +	if (ret) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +out:
> +	if (ret) {
> +		pthread_mutex_unlock(m);
> +	} else {
> +		ret = pthread_mutex_unlock(m);
> +		if (ret)
> +			ret = -errno;
> +	}
> +
> +	pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &old);
> +
> +	return ret;
> +}
> +
> +static void *scrub_one_dev(void *ctx)
> +{
> +	struct scrub_progress *sp = ctx;
> +	int ret;
> +	struct timeval tv;
> +
> +	sp->stats.canceled = 0;
> +	sp->stats.duration = 0;
> +	sp->stats.finished = 0;
> +
> +	ret = ioctl(sp->fd, BTRFS_IOC_SCRUB, &sp->scrub_args);
> +	gettimeofday(&tv, NULL);
> +	sp->ret = ret;
> +	sp->stats.duration = tv.tv_sec - sp->stats.t_start;
> +	sp->stats.canceled = !!ret;
> +	sp->ioctl_errno = errno;
> +	ret = pthread_mutex_lock(&sp->progress_mutex);
> +	if (ret)
> +		return ERR_PTR(-errno);
> +	sp->stats.finished = 1;
> +	ret = pthread_mutex_unlock(&sp->progress_mutex);

   If you downgrade .finished to a plain int, rather than a u64, then
is this locking actually be needed? You have one place where the lock
is held to write a single value (which is atomic for an int, IIRC),
and you have another place where you hold the lock to read and compare
it. I don't see any problem with removing the lock for that.

> +	if (ret)
> +		return ERR_PTR(-errno);
> +	
> +
> +	return NULL;
> +}
> +
> +static void *progress_one_dev(void *ctx)
> +{
> +	struct scrub_progress *sp = ctx;
> +	
> +	sp->ret = ioctl(sp->fd, BTRFS_IOC_SCRUB_PROGRESS, &sp->scrub_args);
> +	sp->ioctl_errno = errno;
> +
> +	return NULL;
> +}
> +
> +static void *scrub_progress_cycle(void *ctx)
> +{
> +	int ret;
> +	int i;
> +	char fsid[37];
> +	struct scrub_progress *sp;
> +	struct scrub_progress *sp_last;
> +	struct scrub_progress *sp_shared;
> +	struct timeval tv;
> +	struct scrub_progress_cycle *spc = ctx;
> +	int ndev = spc->fi->num_devices;
> +	int this = 1;
> +	int last = 0;
> +	int peer_fd = -1;
> +	struct pollfd accept_poll_fd = {
> +		.fd = spc->prg_fd,
> +		.events = POLLIN,
> +		.revents = 0,
> +	};
> +	struct pollfd write_poll_fd = {
> +		.events = POLLOUT,
> +		.revents = 0,
> +	};
> +	struct sockaddr_un peer;
> +	socklen_t peer_size = sizeof(peer);
> +
> +	pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &ret);
> +	uuid_unparse(spc->fi->fsid, fsid);
> +
> +	for (i=0; i<ndev; ++i) {
> +		sp = &spc->progress[i];
> +		sp_last = &spc->progress[i+ndev];
> +		sp_shared = &spc->shared_progress[i];
> +		sp->scrub_args.devid = sp_last->scrub_args.devid =
> +						sp_shared->scrub_args.devid;
> +		sp->fd = sp_last->fd = spc->fdmnt;
> +		sp->stats.t_start = sp_last->stats.t_start =
> +						sp_shared->stats.t_start;
> +		sp->resumed = sp_last->resumed = sp_shared->resumed;
> +		sp->skip = sp_last->skip = sp_shared->skip;
> +		sp->stats.finished = sp_last->stats.finished =
> +						sp_shared->stats.finished;
> +	}
> +
> +	while (1) {
> +		ret = poll(&accept_poll_fd, 1, 5*1000);
> +		if (ret == -1)
> +			return ERR_PTR(-errno);
> +		if (ret)
> +			peer_fd = accept(spc->prg_fd, (struct sockaddr *)&peer,
> +					 &peer_size);
> +		gettimeofday(&tv, NULL);
> +		this = (this+1)%2;
> +		last = (last+1)%2;
> +		for (i=0; i<ndev; ++i) {
> +			sp = &spc->progress[this*ndev+i];
> +			sp_last = &spc->progress[last*ndev+i];
> +			sp_shared = &spc->shared_progress[i];
> +			if (sp->stats.finished) {
> +				continue;
> +			}
> +			progress_one_dev(sp);
> +			sp->stats.duration = tv.tv_sec - sp->stats.t_start;
> +			if (!sp->ret)
> +				continue;
> +			if (sp->ioctl_errno != ENOTCONN &&
> +			    sp->ioctl_errno != ENODEV)
> +				return ERR_PTR(-sp->ioctl_errno);
> +			/*
> +			 * scrub finished or device removed, check the
> +			 * finished flag. if unset, just use the last
> +			 * result we got for the current write and go
> +			 * on. flag should be set on next cycle, then.
> +			 */
> +			ret = pthread_mutex_lock(&sp_shared->progress_mutex);
> +			if (ret)
> +				return ERR_PTR(-errno);
> +			if (!sp_shared->stats.finished) {
> +				ret = pthread_mutex_unlock(
> +						&sp_shared->progress_mutex);
> +				if (ret)
> +					return ERR_PTR(-errno);
> +				memcpy(sp, sp_last, sizeof(*sp));
> +				continue;
> +			}
> +			ret = pthread_mutex_unlock(&sp_shared->progress_mutex);
> +			if (ret)
> +				return ERR_PTR(-errno);
> +			memcpy(sp, sp_shared, sizeof(*sp));
> +			memcpy(sp_last, sp_shared, sizeof(*sp));
> +		}
> +		if (peer_fd != -1) {
> +			write_poll_fd.fd = peer_fd;
> +			ret = poll(&write_poll_fd, 1, 0);
> +			if (ret == -1)
> +				return ERR_PTR(-errno);
> +			if (ret) {
> +				ret = scrub_write_file(
> +					peer_fd, fsid,
> +					&spc->progress[this*ndev], ndev);
> +				if (ret)
> +					return ERR_PTR(ret);
> +			}
> +			close(peer_fd);
> +			peer_fd = -1;
> +		}
> +		if (!spc->do_record)
> +			continue;
> +		ret = scrub_write_progress(spc->write_mutex, fsid,
> +		                           &spc->progress[this*ndev], ndev);
> +		if (ret)
> +			return ERR_PTR(ret);
> +	}
> +}

[...]

> +int do_scrub_cancel(int argc, char **argv)
> +{
> +	char *path = argv[1];
> +	int ret;
> +	int fdmnt;
> +	int err;
> +	char mp[BTRFS_PATH_NAME_MAX+1];
> +	struct btrfs_fs_devices* fs_devices_mnt = NULL;
> +
> +	fdmnt = open_file_or_dir(path);
> +	if (fdmnt < 0) {
> +		fprintf(stderr, "ERROR: scrub cancel failed\n");
> +		return 12;
> +	}
> +
> +again:
> +	ret = ioctl(fdmnt, BTRFS_IOC_SCRUB_CANCEL, NULL);
> +	err = errno;
> +	close(fdmnt);
> +
> +	if (ret && err == EINVAL) {
> +		/* path is no mounted btrfs. try if it's a device */
> +		ret = check_mounted_where(fdmnt, path, mp, sizeof(mp),
> +					  &fs_devices_mnt);
> +		close(fdmnt);
> +		if (ret) {
> +			fdmnt = open_file_or_dir(mp);
> +			if (fdmnt >= 0) {
> +				path = mp;
> +				goto again;
> +			}
> +		}
> +	}
> +
> +	if (ret) {
> +		fprintf(stderr, "ERROR: scrub cancel failed on %s: %s\n", path,
> +		        err == ENOTCONN ? "not running" : strerror(errno));
> +		return 1;
> +	}
> +
> +	printf("scrub cancelled\n");
> +
> +	return 0;
> +}
> +
> +int do_scrub_status(int argc, char **argv)
> +{
> +
> +	char *path;
> +	struct btrfs_ioctl_fs_info_args fi_args;
> +	struct btrfs_ioctl_dev_info_args *di_args = NULL;
> +	struct scrub_file_record **past_scrubs = NULL;
> +	struct scrub_file_record *last_scrub;
> +	struct scrub_fs_stat fs_stat;
> +	struct sockaddr_un addr = {
> +		.sun_family = AF_UNIX,
> +	};
> +	int ret;
> +	int fdmnt;
> +	int i;
> +	optind = 1;
> +	int print_raw = 0;
> +	int do_stats_per_dev = 0;
> +	char c;
> +	char fsid[37];
> +	int fdres = -1;
> +	int err = 0;
> +
> +	while ((c = getopt(argc, argv, "dR")) != -1) {
> +		switch(c) {
> +		case 'd':
> +			do_stats_per_dev = 1;
> +			break;
> +		case 'R':
> +			print_raw = 1;
> +			break;
> +		case '?':
> +		default:
> +			fprintf(stderr, "ERROR: scrub status args invalid.\n"
> +			                " -d  stats per device\n");
> +			return 1;
> +		}
> +	}
> +
> +	path = argv[optind];
> +
> +	fdmnt = open_file_or_dir(path);
> +	if (fdmnt < 0) {
> +		fprintf(stderr, "ERROR: can't access to '%s'\n", path);
> +		return 12;
> +	}
> +
> +	ret = scrub_fs_info(fdmnt, path, &fi_args, &di_args);
> +	if (ret) {
> +		fprintf(stderr, "ERROR: getting dev info for scrub failed: "
> +		        "%s\n", strerror(-ret));
> +		err = 1;
> +		goto out;
> +	}
> +	if (!fi_args.num_devices) {
> +		fprintf(stderr, "ERROR: no devices found\n");
> +		err = 1;
> +		goto out;
> +	}
> +
> +	uuid_unparse(fi_args.fsid, fsid);
> +
> +	fdres = socket(AF_UNIX, SOCK_STREAM, 0);
> +	if (fdres == -1) {
> +		fprintf(stderr, "ERROR: failed to create socket to "
> +			"receive progress information: %s\n",
> +			strerror(errno));
> +		err = 1;
> +		goto out;
> +	}
> +	_scrub_datafile(SCRUB_PROGRESS_SOCKET_PATH, fsid,
> +			NULL, addr.sun_path, sizeof(addr.sun_path)-1);
> +	/* ignore EOVERFLOW, just use shorter name and hope for the best */

   Same comment as in the previous mail about ignoring EOVERFLOW in
this code...

> +	ret = connect(fdres, (struct sockaddr *)&addr, sizeof(addr));
> +	if (ret == -1) {
> +		fdres = scrub_open_file_r(SCRUB_DATA_FILE, fsid);
> +		if (fdres < 0 && fdres != -ENOENT) {
> +			fprintf(stderr, "WARNING: failed to open status file: "
> +				"%s\n", strerror(-fdres));
> +			err = 1;
> +			goto out;
> +		}
> +	}
> +
> +	if (fdres >= 0) {
> +		past_scrubs = scrub_read_file(fdres, 1);
> +		if (IS_ERR(past_scrubs))
> +			fprintf(stderr, "WARNING: failed to read status: %s\n",
> +				strerror(-PTR_ERR(past_scrubs)));
> +	}
> +
> +	printf("scrub status for %s\n", fsid);
> +
> +	/*
> +	 * TODO: rather communicate with scrub process instead of
> +	 *       dumping the file stats for instant results
> +	 */
> +	if (do_stats_per_dev) {
> +		for (i = 0; i < fi_args.num_devices; ++i) {
> +			last_scrub = last_dev_scrub(past_scrubs,
> +			                            di_args[i].devid);
> +			if (!last_scrub) {
> +				print_scrub_dev(&di_args[i], NULL, print_raw,
> +				                NULL, NULL);
> +				continue;
> +			}
> +			print_scrub_dev(&di_args[i], &last_scrub->p, print_raw,
> +				        last_scrub->stats.finished ?
> +			                "history" : "status",
> +			                &last_scrub->stats);
> +		}
> +	} else {
> +		init_fs_stat(&fs_stat);
> +		for (i = 0; i < fi_args.num_devices; ++i) {
> +			last_scrub = last_dev_scrub(past_scrubs,
> +			                            di_args[i].devid);
> +			if (!last_scrub)
> +				continue;
> +			add_to_fs_stat(&last_scrub->p, &last_scrub->stats,
> +			               &fs_stat);
> +		}
> +		print_fs_stat(&fs_stat, print_raw);
> +	}
> +
> +out:
> +	free_history(past_scrubs);
> +	free(di_args);
> +	close(fdmnt);
> +	if (fdres > -1)
> +		close(fdres);
> +
> +	return err;
> +}

   Hugo.
Hugo Mills July 11, 2011, 8:57 p.m. UTC | #4
On Mon, Jul 11, 2011 at 04:29:24PM +0200, Jan Schmidt wrote:
> On 10.07.2011 20:23, Hugo Mills wrote:
> >    Yes, this is over three months after the initial posting, but since
> > nobody else has looked at it yet, and the patch is in my integration
> > stack...
> 
> ... thanks!
> 
> >    I've not reviewed the whole thing -- just the "scrub start" code so
> > far. I've removed the bits I've not checked from the file below.
> 
> I rebased the old branch I found to your current integration branch and
> fixed up a most of what you mentioned. I'll not send a new version out
> until after your complete review (or your statement that this is it or
> your statement that you would rather going on reviewing the revised
> version).

   Thanks. The other half has just gone out (with few comments).

> Things I ripped out are accepted and corrected without resistance.
> Comments follow.

   Only a couple of rejoinders below.

> > On Wed, Mar 30, 2011 at 06:53:12PM +0200, Jan Schmidt wrote:
[...]

> >> +		case 4: /* read dev id */
> >> +			for (j=0; isdigit(l[i+j]) && i+j < avail; ++j)
> >> +				;
> >> +			if (!j || i+j+1 >= avail)
> > 
> >    j == 0 is clearer than !j here, IMO
> > 
> >> +				_SCRUB_ILLEGAL;
> >> +			p[curr]->devid = atoll(&l[i]);
> >> +			i += j + 1;
> > 
> >    Is there any reason that you couldn't just use strtoull here? We
> > know that the string is terminated with a \n (by the guarantee of
> > state 1), so strtoull will always finish within the buffer.
> 
> I just found it way easier to use atoll. We already know the first
> character really is a digit, so why bother with a more cumbersome function?

   Ah, my mistake for not being clearer, I think: I was talking about
the for loop at the head of the state 4 code as well. That only exists
in order to find the end of the number read in by atoll, and strtoull
would do that for you.

[...]

> >> +	char fsid[37];
> > 
> >    Magic number. is there a #define in libuuid for this value?
> 
> At least the man page of uuid_parse clearly states uuids have 36 bytes
> plus a \0 in printf format. uuid/uuid.h doesn't contain such a constant.
> But volumes.c, print-tree.c and others do it with 37, too.

   OK, if that's conventional (and not defined in uuid.h), then go
with the magic number.

   Hugo.
Jan Schmidt July 12, 2011, 8:48 a.m. UTC | #5
On 11.07.2011 22:45, Hugo Mills wrote:
>    OK, here's the remainder of my comments for this file. Not much for
> this bit -- just one comment about locking, a reminder, and an
> observation.

Again, I ripped out the bits I simply corrected. My comments below.

> [...]
>
>> +static int scrub_write_progress(pthread_mutex_t *m, const char *fsid,
>> +                                struct scrub_progress* data, int n)
>> +{
>> +	int ret;
>> +	int fd;
>> +	int old;
>> +
>> +	ret = pthread_mutex_lock(m);
>> +	if (ret) {
>> +		ret = -errno;
>> +		goto out;
>> +	}
>> +
>> +	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &old);
> 
>    Probably not massively important, but you don't check for the
> return value of this call or its counterpart at the end of this
> function.

pthread_* return values where wrong throughout the code. Good that you
pointed at this one. It's all fixed now.

> [...]
>
>> +static void *scrub_one_dev(void *ctx)
>> +{
>> +	struct scrub_progress *sp = ctx;
>> +	int ret;
>> +	struct timeval tv;
>> +
>> +	sp->stats.canceled = 0;
>> +	sp->stats.duration = 0;
>> +	sp->stats.finished = 0;
>> +
>> +	ret = ioctl(sp->fd, BTRFS_IOC_SCRUB, &sp->scrub_args);
>> +	gettimeofday(&tv, NULL);
>> +	sp->ret = ret;
>> +	sp->stats.duration = tv.tv_sec - sp->stats.t_start;
>> +	sp->stats.canceled = !!ret;
>> +	sp->ioctl_errno = errno;
>> +	ret = pthread_mutex_lock(&sp->progress_mutex);
>> +	if (ret)
>> +		return ERR_PTR(-errno);
>> +	sp->stats.finished = 1;
>> +	ret = pthread_mutex_unlock(&sp->progress_mutex);
> 
>    If you downgrade .finished to a plain int, rather than a u64, then
> is this locking actually be needed? You have one place where the lock
> is held to write a single value (which is atomic for an int, IIRC),
> and you have another place where you hold the lock to read and compare
> it. I don't see any problem with removing the lock for that.

Conclusion first: I want to stick with the mutex. My reasoning:
- this is by no means time critical code
- the mutexes do the memory barriers required for synchronization
- using int instead of u64 would complicate the kvread macros

Thanks,
-Jan
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Schmidt July 12, 2011, 8:49 a.m. UTC | #6
On 11.07.2011 22:57, Hugo Mills wrote:
> On Mon, Jul 11, 2011 at 04:29:24PM +0200, Jan Schmidt wrote:
>> On 10.07.2011 20:23, Hugo Mills wrote:
>>>    Yes, this is over three months after the initial posting, but since
>>> nobody else has looked at it yet, and the patch is in my integration
>>> stack...
>>
>> ... thanks!
>>
>>>    I've not reviewed the whole thing -- just the "scrub start" code so
>>> far. I've removed the bits I've not checked from the file below.
>>
>> I rebased the old branch I found to your current integration branch and
>> fixed up a most of what you mentioned. I'll not send a new version out
>> until after your complete review (or your statement that this is it or
>> your statement that you would rather going on reviewing the revised
>> version).
> 
>    Thanks. The other half has just gone out (with few comments).

I'm through now, but I'll wait another day for you to protest on my
latest comments before sending the new version.

>> Things I ripped out are accepted and corrected without resistance.
>> Comments follow.
> 
>    Only a couple of rejoinders below.
> 
>>> On Wed, Mar 30, 2011 at 06:53:12PM +0200, Jan Schmidt wrote:
> [...]
> 
>>>> +		case 4: /* read dev id */
>>>> +			for (j=0; isdigit(l[i+j]) && i+j < avail; ++j)
>>>> +				;
>>>> +			if (!j || i+j+1 >= avail)
>>>
>>>    j == 0 is clearer than !j here, IMO
>>>
>>>> +				_SCRUB_ILLEGAL;
>>>> +			p[curr]->devid = atoll(&l[i]);
>>>> +			i += j + 1;
>>>
>>>    Is there any reason that you couldn't just use strtoull here? We
>>> know that the string is terminated with a \n (by the guarantee of
>>> state 1), so strtoull will always finish within the buffer.
>>
>> I just found it way easier to use atoll. We already know the first
>> character really is a digit, so why bother with a more cumbersome function?
> 
>    Ah, my mistake for not being clearer, I think: I was talking about
> the for loop at the head of the state 4 code as well. That only exists
> in order to find the end of the number read in by atoll, and strtoull
> would do that for you.

Alright, now I see your point. However, with strtoull I would have to
calculate with character pointers, whereas anything else uses direct
character access with i and j here. And I don't need the fancy bits of
strtoull, either. So I'd like to stick with atoll.

> [...]
> 
>>>> +	char fsid[37];
>>>
>>>    Magic number. is there a #define in libuuid for this value?
>>
>> At least the man page of uuid_parse clearly states uuids have 36 bytes
>> plus a \0 in printf format. uuid/uuid.h doesn't contain such a constant.
>> But volumes.c, print-tree.c and others do it with 37, too.
> 
>    OK, if that's conventional (and not defined in uuid.h), then go
> with the magic number.
> 
>    Hugo.
> 

Thanks,
-Jan
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hugo Mills July 12, 2011, 9:44 a.m. UTC | #7
On Tue, Jul 12, 2011 at 10:49:59AM +0200, Jan Schmidt wrote:
> On 11.07.2011 22:57, Hugo Mills wrote:
> > On Mon, Jul 11, 2011 at 04:29:24PM +0200, Jan Schmidt wrote:
> >> On 10.07.2011 20:23, Hugo Mills wrote:
> >>>    Yes, this is over three months after the initial posting, but since
> >>> nobody else has looked at it yet, and the patch is in my integration
> >>> stack...
> >>
> >> ... thanks!
> >>
> >>>    I've not reviewed the whole thing -- just the "scrub start" code so
> >>> far. I've removed the bits I've not checked from the file below.
> >>
> >> I rebased the old branch I found to your current integration branch and
> >> fixed up a most of what you mentioned. I'll not send a new version out
> >> until after your complete review (or your statement that this is it or
> >> your statement that you would rather going on reviewing the revised
> >> version).
> > 
> >    Thanks. The other half has just gone out (with few comments).
> 
> I'm through now, but I'll wait another day for you to protest on my
> latest comments before sending the new version.
> 
> >> Things I ripped out are accepted and corrected without resistance.
> >> Comments follow.
> > 
> >    Only a couple of rejoinders below.
> > 
> >>> On Wed, Mar 30, 2011 at 06:53:12PM +0200, Jan Schmidt wrote:
> > [...]
> > 
> >>>> +		case 4: /* read dev id */
> >>>> +			for (j=0; isdigit(l[i+j]) && i+j < avail; ++j)
> >>>> +				;
> >>>> +			if (!j || i+j+1 >= avail)
> >>>
> >>>    j == 0 is clearer than !j here, IMO
> >>>
> >>>> +				_SCRUB_ILLEGAL;
> >>>> +			p[curr]->devid = atoll(&l[i]);
> >>>> +			i += j + 1;
> >>>
> >>>    Is there any reason that you couldn't just use strtoull here? We
> >>> know that the string is terminated with a \n (by the guarantee of
> >>> state 1), so strtoull will always finish within the buffer.
> >>
> >> I just found it way easier to use atoll. We already know the first
> >> character really is a digit, so why bother with a more cumbersome function?
> > 
> >    Ah, my mistake for not being clearer, I think: I was talking about
> > the for loop at the head of the state 4 code as well. That only exists
> > in order to find the end of the number read in by atoll, and strtoull
> > would do that for you.
> 
> Alright, now I see your point. However, with strtoull I would have to
> calculate with character pointers, whereas anything else uses direct
> character access with i and j here. And I don't need the fancy bits of
> strtoull, either. So I'd like to stick with atoll.

   OK, it's not something I feel massively strongly about. Stick with
atoll, then.

   Hugo.
diff mbox

Patch

diff --git a/scrub.c b/scrub.c
new file mode 100644
index 0000000..22052ed
--- /dev/null
+++ b/scrub.c
@@ -0,0 +1,1568 @@ 
+
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <poll.h>
+#include <sys/file.h>
+#include <uuid/uuid.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <ctype.h>
+#include <signal.h>
+#include <stdarg.h>
+
+#include "ctree.h"
+#include "ioctl.h"
+#include "btrfs_cmds.h"
+#include "utils.h"
+#include "volumes.h"
+#include "disk-io.h"
+
+#define SCRUB_DATA_FILE "/var/btrfs/scrub.status"
+#define SCRUB_PROGRESS_SOCKET_PATH "/var/btrfs/scrub.progress"
+#define SCRUB_FILE_VERSION_PREFIX "scrub status:"
+#define SCRUB_FILE_VERSION "1"
+
+struct scrub_stats {
+	time_t t_start;
+	time_t t_resumed;
+	u64 duration;
+	u64 finished;
+	u64 canceled;
+};
+
+struct scrub_progress {
+	struct btrfs_ioctl_scrub_args scrub_args;
+	int fd;
+	int ret;
+	int skip;
+	struct scrub_stats stats;
+	struct scrub_file_record *resumed;
+	int ioctl_errno;
+	pthread_mutex_t progress_mutex;
+};
+
+struct scrub_file_record {
+	u8 fsid[BTRFS_FSID_SIZE];
+	u64 devid;
+	struct scrub_stats stats;
+	struct btrfs_scrub_progress p;
+};
+
+struct scrub_progress_cycle {
+	int fdmnt;
+	int prg_fd;
+	int do_record;
+	struct btrfs_ioctl_fs_info_args *fi;
+	struct scrub_progress *progress;
+	struct scrub_progress *shared_progress;
+	pthread_mutex_t *write_mutex;
+};
+
+struct scrub_fs_stat {
+	struct btrfs_scrub_progress p;
+	struct scrub_stats s;
+	int i;
+};
+
+static void print_scrub_full(struct btrfs_scrub_progress *sp)
+{
+	printf("\tdata_extents_scrubbed: %lld\n", sp->data_extents_scrubbed);
+	printf("\ttree_extents_scrubbed: %lld\n", sp->tree_extents_scrubbed);
+	printf("\tdata_bytes_scrubbed: %lld\n", sp->data_bytes_scrubbed);
+	printf("\ttree_bytes_scrubbed: %lld\n", sp->tree_bytes_scrubbed);
+	printf("\tread_errors: %lld\n", sp->read_errors);
+	printf("\tcsum_errors: %lld\n", sp->csum_errors);
+	printf("\tverify_errors: %lld\n", sp->verify_errors);
+	printf("\tno_csum: %lld\n", sp->no_csum);
+	printf("\tcsum_discards: %lld\n", sp->csum_discards);
+	printf("\tsuper_errors: %lld\n", sp->super_errors);
+	printf("\tmalloc_errors: %lld\n", sp->malloc_errors);
+	printf("\tuncorrectable_errors: %lld\n", sp->uncorrectable_errors);
+	printf("\tcorrected_errors: %lld\n", sp->corrected_errors);
+	printf("\tlast_physical: %lld\n", sp->last_physical);
+}
+
+#define err(test, ...) do {			\
+	if (test)				\
+		fprintf(stderr, __VA_ARGS__);	\
+} while (0)
+
+#define PRINT_SCRUB_ERROR(test, desc) do {	\
+	if (test)				\
+		printf(" %s=%llu", desc, test);	\
+} while (0)
+static void print_scrub_summary(struct btrfs_scrub_progress *p)
+{
+	u64 err_cnt;
+	u64 err_cnt2;
+
+	err_cnt = p->read_errors +
+			p->csum_errors +
+			p->verify_errors +
+			p->csum_discards +
+			p->super_errors +
+			p->malloc_errors;
+
+	err_cnt2 = p->corrected_errors + p->uncorrectable_errors;
+
+	printf("\ttotal bytes scrubbed: %s with %llu errors\n",
+		pretty_sizes(p->data_bytes_scrubbed + p->tree_bytes_scrubbed),
+		max(err_cnt, err_cnt2));
+	if (err_cnt || err_cnt2) {
+		printf("\terror details:");
+		PRINT_SCRUB_ERROR(p->read_errors, "read");
+		PRINT_SCRUB_ERROR(p->super_errors, "super");
+		PRINT_SCRUB_ERROR(p->malloc_errors, "malloc");
+		PRINT_SCRUB_ERROR(p->verify_errors, "verify");
+		PRINT_SCRUB_ERROR(p->csum_errors, "csum");
+		PRINT_SCRUB_ERROR(p->csum_discards, "csum-discards");
+		printf("\n");
+		printf("\tcorrected errors: %llu, uncorrectable errors: %llu\n",
+		       p->corrected_errors, p->uncorrectable_errors);
+	}
+}
+
+#define _SCRUB_FS_STAT(p, name, fs_stat) fs_stat->p.name += p->name
+#define _SCRUB_FS_STAT_MIN(ss, name, fs_stat)	\
+do {						\
+	if (fs_stat->s.name > ss->name) {	\
+		fs_stat->s.name = ss->name;	\
+	}					\
+} while (0)
+#define _SCRUB_FS_STAT_ZMIN(ss, name, fs_stat)			\
+do {								\
+	if (!fs_stat->s.name || fs_stat->s.name > ss->name) {	\
+		fs_stat->s.name = ss->name;			\
+	}							\
+} while (0)
+#define _SCRUB_FS_STAT_MAX(ss, name, fs_stat)			\
+do {								\
+	if (!fs_stat->s.name || fs_stat->s.name < ss->name) {	\
+		fs_stat->s.name = ss->name;			\
+	}							\
+} while (0)
+static void add_to_fs_stat(struct btrfs_scrub_progress *p,
+                           struct scrub_stats *ss,
+                           struct scrub_fs_stat *fs_stat)
+{
+	_SCRUB_FS_STAT(p, data_extents_scrubbed, fs_stat);
+	_SCRUB_FS_STAT(p, tree_extents_scrubbed, fs_stat);
+	_SCRUB_FS_STAT(p, data_bytes_scrubbed, fs_stat);
+	_SCRUB_FS_STAT(p, tree_bytes_scrubbed, fs_stat);
+	_SCRUB_FS_STAT(p, read_errors, fs_stat);
+	_SCRUB_FS_STAT(p, csum_errors, fs_stat);
+	_SCRUB_FS_STAT(p, verify_errors, fs_stat);
+	_SCRUB_FS_STAT(p, no_csum, fs_stat);
+	_SCRUB_FS_STAT(p, csum_discards, fs_stat);
+	_SCRUB_FS_STAT(p, super_errors, fs_stat);
+	_SCRUB_FS_STAT(p, malloc_errors, fs_stat);
+	_SCRUB_FS_STAT(p, uncorrectable_errors, fs_stat);
+	_SCRUB_FS_STAT(p, corrected_errors, fs_stat);
+	_SCRUB_FS_STAT(p, last_physical, fs_stat);
+	_SCRUB_FS_STAT_ZMIN(ss, t_start, fs_stat);
+	_SCRUB_FS_STAT_ZMIN(ss, t_resumed, fs_stat);
+	_SCRUB_FS_STAT_MAX(ss, duration, fs_stat);
+	_SCRUB_FS_STAT_MAX(ss, canceled, fs_stat);
+	_SCRUB_FS_STAT_MIN(ss, finished, fs_stat);
+}
+
+static void init_fs_stat(struct scrub_fs_stat *fs_stat)
+{
+	memset(fs_stat, 0, sizeof(*fs_stat));
+	fs_stat->s.finished = 2;
+}
+
+static void _print_scrub_ss(struct scrub_stats *ss)
+{
+	char t[BTRFS_PATH_NAME_MAX+1];
+	struct tm tm;
+
+	if (!ss || !ss->t_start) {
+		printf("\tno stats available\n");
+		return;
+	}
+	if (ss->t_resumed) {
+		localtime_r(&ss->t_resumed, &tm);
+		strftime(t, sizeof(t), "%c", &tm);
+		printf("\tscrub resumed at %s", t);
+	} else {
+		localtime_r(&ss->t_start, &tm);
+		strftime(t, sizeof(t), "%c", &tm);
+		printf("\tscrub started at %s", t);
+	}
+	if (ss->finished && !ss->canceled) {
+		printf(" and finished after %llu seconds\n",
+		       ss->duration);
+	} else if (ss->canceled) {
+		printf(" and was aborted after %llu seconds\n",
+		       ss->duration);
+	} else {
+		printf(", running for %llu seconds\n", ss->duration);
+	}
+}
+
+static void print_scrub_dev(struct btrfs_ioctl_dev_info_args *di,
+                            struct btrfs_scrub_progress *p, int raw,
+                            const char *append, struct scrub_stats *ss)
+{
+	printf("scrub device %s (id %llu) %s\n", di->path, di->devid,
+	       append ? append : "");
+
+	_print_scrub_ss(ss);
+
+	if (p) {
+		if (raw)
+			print_scrub_full(p);
+		else
+			print_scrub_summary(p);
+	}
+}
+
+static void print_fs_stat(struct scrub_fs_stat *fs_stat, int raw)
+{
+	_print_scrub_ss(&fs_stat->s);
+
+	if (raw)
+		print_scrub_full(&fs_stat->p);
+	else
+		print_scrub_summary(&fs_stat->p);
+}
+
+static void free_history(struct scrub_file_record **last_scrubs)
+{
+	struct scrub_file_record **l = last_scrubs;
+	if (!l)
+		return;
+	while (*l)
+		free(*l++);
+	free(last_scrubs);
+}
+
+static int cancel_fd = -1;
+static void scrub_sigint_record_progress(int signal)
+{
+	ioctl(cancel_fd, BTRFS_IOC_SCRUB_CANCEL, NULL);
+}
+
+static int scrub_handle_sigint_parent(void)
+{
+	struct sigaction sa = {
+		.sa_handler = SIG_IGN,
+		.sa_flags = SA_RESTART,
+	};
+
+	return sigaction(SIGINT, &sa, NULL);
+}
+
+static int scrub_handle_sigint_child(int fd)
+{
+	struct sigaction sa = {
+		.sa_handler = fd == -1 ? SIG_DFL : scrub_sigint_record_progress,
+	};
+
+	cancel_fd = fd;
+	return sigaction(SIGINT, &sa, NULL);
+}
+
+static int _scrub_datafile(const char *fn_base, const char *fn_local,
+                           const char *fn_tmp, char *datafile, int max)
+{
+	int ret;
+
+	strncpy(datafile, fn_base, max);
+	ret = strlen(datafile);
+	
+	if (ret + 1 >= max)
+		return -EOVERFLOW;
+	
+	datafile[ret] = '.';
+	strncpy(datafile+ret+1, fn_local, max-ret-1);
+	ret = strlen(datafile);
+
+	if (ret + 1 >= max)
+		return -EOVERFLOW;
+
+	if (fn_tmp) {
+		datafile[ret] = '_';
+		strncpy(datafile+ret+1, fn_tmp, max-ret-1);
+		ret = strlen(datafile);
+
+		if (ret >= max)
+			return -EOVERFLOW;
+	}
+
+	return 0;
+}
+
+static int _scrub_open_file(const char *datafile, int m)
+{
+	int fd;
+	int ret;
+
+	fd = open(datafile, m, 0600);
+	if (fd < 0)
+		return -errno;
+
+	ret = flock(fd, LOCK_EX|LOCK_NB);
+	if (ret) {
+		ret = errno;
+		close(fd);
+		return -ret;
+	}
+
+	return fd;
+}
+
+static int scrub_open_file_r(const char *fn_base, const char *fn_local)
+{
+	int ret;
+	char datafile[BTRFS_PATH_NAME_MAX+1];
+	ret = _scrub_datafile(fn_base, fn_local, NULL,
+	                      datafile, sizeof(datafile));
+	if (ret < 0)
+		return ret;
+	return _scrub_open_file(datafile, O_RDONLY);
+}
+
+static int scrub_open_file_w(const char *fn_base, const char *fn_local,
+                             const char *tmp)
+{
+	int ret;
+	char datafile[BTRFS_PATH_NAME_MAX+1];
+	ret = _scrub_datafile(fn_base, fn_local, tmp,
+	                      datafile, sizeof(datafile));
+	if (ret < 0)
+		return ret;
+	return _scrub_open_file(datafile, O_WRONLY|O_CREAT);
+}
+
+static int scrub_rename_file(const char *fn_base, const char *fn_local,
+                             const char *tmp)
+{
+	int ret;
+	char datafile_old[BTRFS_PATH_NAME_MAX+1];
+	char datafile_new[BTRFS_PATH_NAME_MAX+1];
+	ret = _scrub_datafile(fn_base, fn_local, tmp,
+	                      datafile_old, sizeof(datafile_old));
+	if (ret < 0)
+		return ret;
+	ret = _scrub_datafile(fn_base, fn_local, NULL,
+	                      datafile_new, sizeof(datafile_new));
+	if (ret < 0)
+		return ret;
+	ret = rename(datafile_old, datafile_new);
+	return ret ? -errno : 0;
+}
+
+#define _SCRUB_KVREAD(i, name, avail, l, dest) \
+	_scrub_kvread(i, sizeof(#name), avail, l, #name, dest.name)
+#define _SCRUB_KVREAD_STATS(i, name, avail, l, dest) \
+	_scrub_kvread(i, sizeof(#name), avail, l, #name, dest->stats.name)
+/*
+ * returns 0 if the key did not match (nothing was read)
+ *         1 if the key did match (success)
+ *        -1 if the key did match and an error occured
+ */
+static int _scrub_kvread(int *i, int len, int avail, const char *buf,
+                         const char *key, u64 *dest)
+{
+	int j;
+
+	if (*i+len+1 < avail && strncmp(&buf[*i], key, len-1) == 0) {
+		*i += len-1;
+		if (buf[*i] != ':') {
+			return -1;
+		}
+		*i += 1;
+		for (j=0; isdigit(buf[*i+j]) && *i+j < avail; ++j)
+			;
+		if (*i+j >= avail)
+			return -1;
+		*dest = atoll(&buf[*i]);
+		*i += j;
+		return 1;
+	}
+	
+	return 0;
+}
+
+#define _SCRUB_ILLEGAL do {						\
+	if (report_errors) {						\
+		fprintf(stderr, "WARNING: illegal data in line %d pos "	\
+		        "%d state %d (near \"%.*s\") at %s:%d\n",	\
+		        lineno, i, state, 20 > avail ? avail : 20, l+i,	\
+		        __FILE__, __LINE__);				\
+	}								\
+	goto skip;							\
+} while (0)
+static struct scrub_file_record **scrub_read_file(int fd, int report_errors)
+{
+	int avail = 0;
+	int old_avail = 0;
+	char l[512];
+	int state = 0;
+	int curr = -1;
+	int i = 0;
+	int j;
+	int ret;
+	int eof = 0;
+	int lineno = 0;
+	u64 version;
+	char empty_uuid[BTRFS_FSID_SIZE] = {0};
+	struct scrub_file_record **p = NULL;
+
+	if (fd < 0)
+		return ERR_PTR(-EINVAL);
+
+again:
+	old_avail = avail-i;
+	BUG_ON(old_avail < 0);
+	if (old_avail)
+		memmove(l, l+i, old_avail);
+	avail = read(fd, l+old_avail, sizeof(l)-old_avail);
+	if (avail == 0) {
+		eof = 1;
+	}
+	if (avail + old_avail == 0) {
+		if (curr >= 0 &&
+		    memcmp(p[curr]->fsid, empty_uuid, BTRFS_FSID_SIZE) == 0) {
+			p[curr] = NULL;
+		} else if (curr == -1) {
+			p = ERR_PTR(-ENODATA);
+		}
+		return p;
+	}
+	if (avail == -1)
+		return ERR_PTR(-errno);
+	avail += old_avail;
+
+	i = 0;
+	while (i < avail) {
+		switch (state) {
+		case 0: /* start if file */
+			ret = _scrub_kvread(&i,
+				sizeof(SCRUB_FILE_VERSION_PREFIX)-1, avail, l,
+				SCRUB_FILE_VERSION_PREFIX, &version);
+			if (ret != 1)
+				_SCRUB_ILLEGAL;
+			if (version != atoll(SCRUB_FILE_VERSION))
+				return ERR_PTR(-ENOTSUP);
+			state = 6;
+			continue;
+		case 1: /* start of line, alloc */
+			if (!eof && !memchr(l+i, '\n', avail-i))
+				goto again;
+			++lineno;
+			if (curr > -1 && memcmp(p[curr]->fsid, empty_uuid,
+			                        BTRFS_FSID_SIZE) == 0) {
+				state = 2;
+				continue;
+			}
+			++curr;
+			p = realloc(p, (curr+2)*sizeof(*p));
+			if (p)
+				p[curr] = malloc(sizeof(**p));
+			if (!p || !p[curr])
+				return ERR_PTR(-errno);
+			memset(p[curr], 0, sizeof(**p));
+			p[curr+1] = NULL;
+			++state;
+		case 2: /* start of line, skip space */
+			while (isspace(l[i]) && i<avail) {
+				if (l[i] == '\n')
+					++lineno;
+				++i;
+			}
+			if (i >= avail || (!eof && !memchr(l+i, '\n', avail-i)))
+				goto again;
+			++state;
+		case 3: /* read fsid */
+			if (i == avail)
+				continue;
+			for (j=0; l[i+j] != ':' && i+j < avail; ++j)
+				;
+			if (i+j+1 >= avail)
+				_SCRUB_ILLEGAL;
+			if (j != 36)
+				_SCRUB_ILLEGAL;
+			l[i+j] = '\0';
+			ret = uuid_parse(l+i, p[curr]->fsid);
+			if (ret)
+				_SCRUB_ILLEGAL;
+			i += j + 1;
+			++state;
+		case 4: /* read dev id */
+			for (j=0; isdigit(l[i+j]) && i+j < avail; ++j)
+				;
+			if (!j || i+j+1 >= avail)
+				_SCRUB_ILLEGAL;
+			p[curr]->devid = atoll(&l[i]);
+			i += j + 1;
+			++state;
+		case 5: /* read key/value pair */
+			ret = _SCRUB_KVREAD(&i, data_extents_scrubbed, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, data_extents_scrubbed, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, tree_extents_scrubbed, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, data_bytes_scrubbed, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, tree_bytes_scrubbed, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, read_errors, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, csum_errors, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, verify_errors, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, no_csum, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, csum_discards, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, super_errors, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, malloc_errors, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, uncorrectable_errors, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, corrected_errors, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, last_physical, avail,
+			                    l, &p[curr]->p) ||
+			      _SCRUB_KVREAD(&i, finished, avail,
+			                    l, &p[curr]->stats) ||
+			      _SCRUB_KVREAD(&i, t_start, avail,
+			                    l, (u64*)&p[curr]->stats) ||
+			      _SCRUB_KVREAD(&i, t_resumed, avail,
+			                    l, (u64*)&p[curr]->stats) ||
+			      _SCRUB_KVREAD(&i, duration, avail,
+			                    l, (u64*)&p[curr]->stats) ||
+			      _SCRUB_KVREAD(&i, canceled, avail,
+			                    l, &p[curr]->stats);
+			if (ret != 1)
+				_SCRUB_ILLEGAL;
+			++state;
+		case 6: /* after number */
+			if (l[i] == '|') {
+				state = 5;
+			} else if (l[i] == '\n') {
+				state = 1;
+			} else {
+				_SCRUB_ILLEGAL;
+			}
+			++i;
+			continue;
+		case 99: /* skip rest of line */
+skip:
+			state = 99;
+			do {
+				++i;
+				if (l[i-1] == '\n') {
+					state = 1;
+					break;
+				}
+			} while (i < avail);
+			continue;
+		}
+		BUG();
+	}
+	goto again;
+}
+#undef _SCRUB_ILLEGAL
+         
+static int _scrub_write_buf(int fd, const void *data, int len)
+{
+	int ret;
+	ret = write(fd, data, len);
+	return ret - len;
+}
+
+static int _scrub_writev(int fd, char *buf, int max, const char *fmt, ...)
+				__attribute__ ((format (printf, 4, 5)));
+static int _scrub_writev(int fd, char *buf, int max, const char *fmt, ...)
+{
+	int ret;
+	va_list args;
+	
+	va_start(args, fmt);
+	ret = vsnprintf(buf, max, fmt, args);
+	va_end(args);
+	if (ret >= max)
+		return ret - max;
+	return _scrub_write_buf(fd, buf, ret);
+}
+
+#define _SCRUB_SUM(dest, data, name) dest->scrub_args.progress.name =	\
+			data->resumed->p.name + data->scrub_args.progress.name
+static struct scrub_progress *_scrub_resumed_stats(struct scrub_progress *data,
+                                                   struct scrub_progress *dest)
+{
+	if (!data->resumed || data->skip)
+		return data;
+
+	_SCRUB_SUM(dest, data, data_extents_scrubbed);
+	_SCRUB_SUM(dest, data, tree_extents_scrubbed);
+	_SCRUB_SUM(dest, data, data_bytes_scrubbed);
+	_SCRUB_SUM(dest, data, tree_bytes_scrubbed);
+	_SCRUB_SUM(dest, data, read_errors);
+	_SCRUB_SUM(dest, data, csum_errors);
+	_SCRUB_SUM(dest, data, verify_errors);
+	_SCRUB_SUM(dest, data, no_csum);
+	_SCRUB_SUM(dest, data, csum_discards);
+	_SCRUB_SUM(dest, data, super_errors);
+	_SCRUB_SUM(dest, data, malloc_errors);
+	_SCRUB_SUM(dest, data, uncorrectable_errors);
+	_SCRUB_SUM(dest, data, corrected_errors);
+	_SCRUB_SUM(dest, data, last_physical);
+	dest->stats.canceled = data->stats.canceled;
+	dest->stats.finished = data->stats.finished;
+	dest->stats.t_resumed = data->stats.t_start;
+	dest->stats.t_start = data->resumed->stats.t_start;
+	dest->stats.duration = data->resumed->stats.duration +
+							data->stats.duration;
+	dest->scrub_args.devid = data->scrub_args.devid;
+	return dest;
+}
+
+#define _SCRUB_KVWRITE(fd, buf, name, use) 		\
+	_scrub_kvwrite(fd, buf, sizeof(buf), #name, 	\
+	               use->scrub_args.progress.name)
+#define _SCRUB_KVWRITE_STATS(fd, buf, name, use) 	\
+	_scrub_kvwrite(fd, buf, sizeof(buf), #name, 	\
+	               use->stats.name)
+static int _scrub_kvwrite(int fd, char *buf, int max,
+                          const char *key, u64 val)
+{
+	return _scrub_writev(fd, buf, max, "|%s:%lld", key, val);
+}
+
+static int scrub_write_file(int fd, const char *fsid,
+                            struct scrub_progress* data, int n)
+{
+	int ret = 0;
+	int i;
+	char buf[1024];
+	struct scrub_progress local;
+	struct scrub_progress *use;
+
+	if (n < 1) {
+		return -EINVAL;
+	}
+
+	ret = _scrub_write_buf(fd, SCRUB_FILE_VERSION_PREFIX SCRUB_FILE_VERSION
+	                       "\n", sizeof(SCRUB_FILE_VERSION_PREFIX)-1
+	                       + sizeof(SCRUB_FILE_VERSION)-1 + 1);
+	if (ret)
+		return -EOVERFLOW;
+
+	for (i=0; i<n; ++i) {
+		use = _scrub_resumed_stats(&data[i], &local);
+		if (_scrub_write_buf(fd, fsid, strlen(fsid)) ||
+		    _scrub_write_buf(fd, ":", 1) ||
+		    _scrub_writev(fd, buf, sizeof(buf), "%lld",
+		                  use->scrub_args.devid) ||
+		    _scrub_write_buf(fd, buf, ret) ||
+		    _SCRUB_KVWRITE(fd, buf, data_extents_scrubbed, use) ||
+		    _SCRUB_KVWRITE(fd, buf, tree_extents_scrubbed, use) ||
+		    _SCRUB_KVWRITE(fd, buf, data_bytes_scrubbed, use) ||
+		    _SCRUB_KVWRITE(fd, buf, tree_bytes_scrubbed, use) ||
+		    _SCRUB_KVWRITE(fd, buf, read_errors, use) ||
+		    _SCRUB_KVWRITE(fd, buf, csum_errors, use) ||
+		    _SCRUB_KVWRITE(fd, buf, verify_errors, use) ||
+		    _SCRUB_KVWRITE(fd, buf, no_csum, use) ||
+		    _SCRUB_KVWRITE(fd, buf, csum_discards, use) ||
+		    _SCRUB_KVWRITE(fd, buf, super_errors, use) ||
+		    _SCRUB_KVWRITE(fd, buf, malloc_errors, use) ||
+		    _SCRUB_KVWRITE(fd, buf, uncorrectable_errors, use) ||
+		    _SCRUB_KVWRITE(fd, buf, corrected_errors, use) ||
+		    _SCRUB_KVWRITE(fd, buf, last_physical, use) ||
+		    _SCRUB_KVWRITE_STATS(fd, buf, t_start, use) ||
+		    _SCRUB_KVWRITE_STATS(fd, buf, t_resumed, use) ||
+		    _SCRUB_KVWRITE_STATS(fd, buf, duration, use) ||
+		    _SCRUB_KVWRITE_STATS(fd, buf, canceled, use) ||
+		    _SCRUB_KVWRITE_STATS(fd, buf, finished, use) ||
+		    _scrub_write_buf(fd, "\n", 1)) {
+			return -EOVERFLOW;
+		}
+	}
+
+	return 0;
+}
+#undef _SCRUB_KVWRITE
+
+static int scrub_write_progress(pthread_mutex_t *m, const char *fsid,
+                                struct scrub_progress* data, int n)
+{
+	int ret;
+	int fd;
+	int old;
+
+	ret = pthread_mutex_lock(m);
+	if (ret) {
+		ret = -errno;
+		goto out;
+	}
+
+	pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &old);
+
+	fd = scrub_open_file_w(SCRUB_DATA_FILE, fsid, "tmp");
+	if (fd < 0) {
+		ret = fd;
+		goto out;
+	}
+	ret = scrub_write_file(fd, fsid, data, n);
+	if (ret)
+		goto out;
+	ret = scrub_rename_file(SCRUB_DATA_FILE, fsid, "tmp");
+	if (ret)
+		goto out;
+	ret = close(fd);
+	if (ret) {
+		ret = -errno;
+		goto out;
+	}
+
+out:
+	if (ret) {
+		pthread_mutex_unlock(m);
+	} else {
+		ret = pthread_mutex_unlock(m);
+		if (ret)
+			ret = -errno;
+	}
+
+	pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &old);
+
+	return ret;
+}
+
+static void *scrub_one_dev(void *ctx)
+{
+	struct scrub_progress *sp = ctx;
+	int ret;
+	struct timeval tv;
+
+	sp->stats.canceled = 0;
+	sp->stats.duration = 0;
+	sp->stats.finished = 0;
+
+	ret = ioctl(sp->fd, BTRFS_IOC_SCRUB, &sp->scrub_args);
+	gettimeofday(&tv, NULL);
+	sp->ret = ret;
+	sp->stats.duration = tv.tv_sec - sp->stats.t_start;
+	sp->stats.canceled = !!ret;
+	sp->ioctl_errno = errno;
+	ret = pthread_mutex_lock(&sp->progress_mutex);
+	if (ret)
+		return ERR_PTR(-errno);
+	sp->stats.finished = 1;
+	ret = pthread_mutex_unlock(&sp->progress_mutex);
+	if (ret)
+		return ERR_PTR(-errno);
+	
+
+	return NULL;
+}
+
+static void *progress_one_dev(void *ctx)
+{
+	struct scrub_progress *sp = ctx;
+	
+	sp->ret = ioctl(sp->fd, BTRFS_IOC_SCRUB_PROGRESS, &sp->scrub_args);
+	sp->ioctl_errno = errno;
+
+	return NULL;
+}
+
+static void *scrub_progress_cycle(void *ctx)
+{
+	int ret;
+	int i;
+	char fsid[37];
+	struct scrub_progress *sp;
+	struct scrub_progress *sp_last;
+	struct scrub_progress *sp_shared;
+	struct timeval tv;
+	struct scrub_progress_cycle *spc = ctx;
+	int ndev = spc->fi->num_devices;
+	int this = 1;
+	int last = 0;
+	int peer_fd = -1;
+	struct pollfd accept_poll_fd = {
+		.fd = spc->prg_fd,
+		.events = POLLIN,
+		.revents = 0,
+	};
+	struct pollfd write_poll_fd = {
+		.events = POLLOUT,
+		.revents = 0,
+	};
+	struct sockaddr_un peer;
+	socklen_t peer_size = sizeof(peer);
+
+	pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &ret);
+	uuid_unparse(spc->fi->fsid, fsid);
+
+	for (i=0; i<ndev; ++i) {
+		sp = &spc->progress[i];
+		sp_last = &spc->progress[i+ndev];
+		sp_shared = &spc->shared_progress[i];
+		sp->scrub_args.devid = sp_last->scrub_args.devid =
+						sp_shared->scrub_args.devid;
+		sp->fd = sp_last->fd = spc->fdmnt;
+		sp->stats.t_start = sp_last->stats.t_start =
+						sp_shared->stats.t_start;
+		sp->resumed = sp_last->resumed = sp_shared->resumed;
+		sp->skip = sp_last->skip = sp_shared->skip;
+		sp->stats.finished = sp_last->stats.finished =
+						sp_shared->stats.finished;
+	}
+
+	while (1) {
+		ret = poll(&accept_poll_fd, 1, 5*1000);
+		if (ret == -1)
+			return ERR_PTR(-errno);
+		if (ret)
+			peer_fd = accept(spc->prg_fd, (struct sockaddr *)&peer,
+					 &peer_size);
+		gettimeofday(&tv, NULL);
+		this = (this+1)%2;
+		last = (last+1)%2;
+		for (i=0; i<ndev; ++i) {
+			sp = &spc->progress[this*ndev+i];
+			sp_last = &spc->progress[last*ndev+i];
+			sp_shared = &spc->shared_progress[i];
+			if (sp->stats.finished) {
+				continue;
+			}
+			progress_one_dev(sp);
+			sp->stats.duration = tv.tv_sec - sp->stats.t_start;
+			if (!sp->ret)
+				continue;
+			if (sp->ioctl_errno != ENOTCONN &&
+			    sp->ioctl_errno != ENODEV)
+				return ERR_PTR(-sp->ioctl_errno);
+			/*
+			 * scrub finished or device removed, check the
+			 * finished flag. if unset, just use the last
+			 * result we got for the current write and go
+			 * on. flag should be set on next cycle, then.
+			 */
+			ret = pthread_mutex_lock(&sp_shared->progress_mutex);
+			if (ret)
+				return ERR_PTR(-errno);
+			if (!sp_shared->stats.finished) {
+				ret = pthread_mutex_unlock(
+						&sp_shared->progress_mutex);
+				if (ret)
+					return ERR_PTR(-errno);
+				memcpy(sp, sp_last, sizeof(*sp));
+				continue;
+			}
+			ret = pthread_mutex_unlock(&sp_shared->progress_mutex);
+			if (ret)
+				return ERR_PTR(-errno);
+			memcpy(sp, sp_shared, sizeof(*sp));
+			memcpy(sp_last, sp_shared, sizeof(*sp));
+		}
+		if (peer_fd != -1) {
+			write_poll_fd.fd = peer_fd;
+			ret = poll(&write_poll_fd, 1, 0);
+			if (ret == -1)
+				return ERR_PTR(-errno);
+			if (ret) {
+				ret = scrub_write_file(
+					peer_fd, fsid,
+					&spc->progress[this*ndev], ndev);
+				if (ret)
+					return ERR_PTR(ret);
+			}
+			close(peer_fd);
+			peer_fd = -1;
+		}
+		if (!spc->do_record)
+			continue;
+		ret = scrub_write_progress(spc->write_mutex, fsid,
+		                           &spc->progress[this*ndev], ndev);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+}
+
+static struct scrub_file_record *last_dev_scrub(
+		struct scrub_file_record *const *const past_scrubs, u64 devid)
+{
+	int i;
+
+	if (!past_scrubs || IS_ERR(past_scrubs))
+		return NULL;
+
+	for (i=0; past_scrubs[i]; ++i)
+		if (past_scrubs[i]->devid == devid)
+			return past_scrubs[i];
+
+	return NULL;
+}
+
+static int scrub_device_info(int fd, u64 devid,
+			     struct btrfs_ioctl_dev_info_args *di_args)
+{
+	int ret;
+
+	di_args->devid = devid;
+	memset(&di_args->uuid, '\0', sizeof(di_args->uuid));
+
+	ret = ioctl(fd, BTRFS_IOC_DEV_INFO, di_args);
+	return ret ? -errno : 0;
+}
+
+static int scrub_fs_info(int fd, char *path,
+                         struct btrfs_ioctl_fs_info_args *fi_args,
+                         struct btrfs_ioctl_dev_info_args **di_ret)
+{
+	int ret = 0;
+	int ndevs = 0;
+	int i = 1;
+	struct btrfs_fs_devices* fs_devices_mnt = NULL;
+	struct btrfs_ioctl_dev_info_args *di_args;
+	char mp[BTRFS_PATH_NAME_MAX+1];
+
+	memset(fi_args, 0, sizeof(*fi_args));
+
+	ret = ioctl(fd, BTRFS_IOC_FS_INFO, fi_args);
+	if (ret && errno == EINVAL) {
+		/* path is no mounted btrfs. try if it's a device */
+		ret = check_mounted_where(fd, path, mp, sizeof(mp),
+		                          &fs_devices_mnt);
+		if (!ret)
+			return -EINVAL;
+		fi_args->num_devices = 1;
+		fi_args->max_id = fs_devices_mnt->latest_devid;
+		i = fs_devices_mnt->latest_devid;
+		memcpy(fi_args->fsid, fs_devices_mnt->fsid, BTRFS_FSID_SIZE);
+		close(fd);
+		fd = open_file_or_dir(mp);
+		if (fd < 0)
+			return -errno;
+	} else if (ret) {
+		return -errno;
+	}
+
+	if (!fi_args->num_devices)
+		return 0;
+
+	di_args = *di_ret = malloc(fi_args->num_devices*sizeof(*di_args));
+	if (!di_args)
+		return -errno;
+
+	for (; i<=fi_args->max_id; ++i) {
+		BUG_ON(ndevs >= fi_args->num_devices);
+		ret = scrub_device_info(fd, i, &di_args[ndevs]);
+		if (ret == -ENODEV)
+			continue;
+		if (ret)
+			return ret;
+		++ndevs;
+	}
+
+	BUG_ON(ndevs == 0);
+
+	return 0;
+}
+
+int mkdir_p(char *path)
+{
+	int i;
+	int ret;
+
+	for (i=1; i<strlen(path); ++i) {
+		if (path[i] != '/')
+			continue;
+		path[i] = '\0';
+		ret = mkdir(path, 0777);
+		if (ret && errno != EEXIST)
+			return 1;
+		path[i] = '/';
+	}
+
+	return 0;
+}
+
+static int scrub_start(int argc, char **argv, int resume)
+{
+	int fdmnt;
+	int prg_fd = -1;
+	int fdres = -1;
+	int ret;
+	pid_t pid;
+	int c;
+	int i;
+	int err = 0;
+	int print_raw = 0;
+	char *path;
+	int do_background = 1;
+	int do_wait = 0;
+	int do_print = 0;
+	int do_quiet = 0;
+	int do_record = 1;
+	int readonly = 0;
+	int do_stats_per_dev = 0;
+	int n_start = 0;
+	int n_skip = 0;
+	int n_resume = 0;
+	struct btrfs_ioctl_fs_info_args fi_args;
+	struct btrfs_ioctl_dev_info_args *di_args = NULL;
+	struct scrub_progress *sp = NULL;
+	struct scrub_fs_stat fs_stat;
+	struct timeval tv;
+	struct sockaddr_un addr = {
+		.sun_family = AF_UNIX,
+	};
+	pthread_t *t_devs = NULL;
+	pthread_t t_prog;
+	pthread_attr_t t_attr;
+	struct scrub_file_record **past_scrubs = NULL;
+	struct scrub_file_record *last_scrub = NULL;
+	char *datafile = strdup(SCRUB_DATA_FILE);
+	char fsid[37];
+	char sock_path[BTRFS_PATH_NAME_MAX+1] = "";
+	struct scrub_progress_cycle spc;
+	pthread_mutex_t spc_write_mutex = PTHREAD_MUTEX_INITIALIZER;
+	void *terr;
+	u64 devid;
+
+	optind = 1;
+	while ((c = getopt(argc, argv, "BdqrR")) != -1) {
+		switch(c) {
+		case 'B':
+			do_background = 0;
+			do_wait = 1;
+			do_print = 1;
+			break;
+		case 'd':
+			do_stats_per_dev = 1;
+			break;
+		case 'q':
+			do_quiet = 1;
+			break;
+		case 'r':
+			readonly = 1;
+			break;
+		case 'R':
+			print_raw = 1;
+			break;
+		case '?':
+		default:
+			fprintf(stderr, "ERROR: scrub args invalid.\n"
+			                " -B  do not background (implies -W)\n"
+			                " -d  stats per device (-B only)\n"
+			                " -q  quiet\n"
+			                " -r  read only mode\n");
+			return 1;
+		}
+	}
+
+	/* try to catch most error cases before forking */
+
+	spc.progress = NULL;
+	if (do_quiet && do_print)
+		do_print = 0;
+
+	if (mkdir_p(datafile)) {
+		err(!do_quiet, "WARNING: cannot create scrub data "
+			       "file, mkdir %s failed: %s. Status recording "
+			       "disabled\n", datafile, strerror(errno));
+		do_record = 0;
+	}
+
+	path = argv[optind];
+
+	fdmnt = open_file_or_dir(path);
+	if (fdmnt < 0) {
+		err(!do_quiet, "ERROR: can't access '%s'\n", path);
+		return 12;
+	}
+
+	ret = scrub_fs_info(fdmnt, path, &fi_args, &di_args);
+	if (ret) {
+		err(!do_quiet, "ERROR: getting dev info for scrub failed: "
+		    "%s\n", strerror(-ret));
+		err = 1;
+		goto out;
+	}
+	if (!fi_args.num_devices) {
+		err(!do_quiet, "ERROR: no devices found\n");
+		err = 1;
+		goto out;
+	}
+
+	uuid_unparse(fi_args.fsid, fsid);
+	fdres = scrub_open_file_r(SCRUB_DATA_FILE, fsid);
+	if (fdres < 0 && fdres != -ENOENT) {
+		err(!do_quiet, "WARNING: failed to open status file: "
+		    "%s\n", strerror(-fdres));
+	} else if (fdres >= 0) {
+		past_scrubs = scrub_read_file(fdres, !do_quiet);
+		if (IS_ERR(past_scrubs))
+			err(!do_quiet, "WARNING: failed to read status file: "
+			    "%s\n", strerror(-PTR_ERR(past_scrubs)));
+		close(fdres);
+	}
+
+	t_devs = malloc(fi_args.num_devices*sizeof(*t_devs));
+	sp = calloc(1, fi_args.num_devices*sizeof(*sp));
+	spc.progress = calloc(1, fi_args.num_devices*2*sizeof(*spc.progress));
+
+	if (!t_devs || !sp || !spc.progress) {
+		err(!do_quiet, "ERROR: scrub failed: %s", strerror(errno));
+		err = 1;
+		goto out;
+	}
+
+	ret = pthread_attr_init(&t_attr);
+	if (ret) {
+		err(!do_quiet, "ERROR: pthread_attr_init failed: %s\n",
+		    strerror(ret));
+		err = 1;
+		goto out;
+	}
+
+	for (i = 0; i < fi_args.num_devices; ++i) {
+		devid = di_args[i].devid;
+		ret = pthread_mutex_init(&sp[i].progress_mutex, NULL);
+		if (ret) {
+			err(!do_quiet, "ERROR: pthread_mutex_init failed: "
+			    "%s\n", strerror(ret));
+			err = 1;
+			goto out;
+		}
+		last_scrub = last_dev_scrub(past_scrubs, devid);
+		sp[i].scrub_args.devid = devid;
+		sp[i].fd = fdmnt;
+		if (resume && last_scrub && (last_scrub->stats.canceled ||
+		                             !last_scrub->stats.finished)) {
+			++n_resume;
+			sp[i].scrub_args.start = last_scrub->p.last_physical;
+			sp[i].resumed = last_scrub;
+		} else if (resume) {
+			++n_skip;
+			sp[i].skip = 1;
+			sp[i].resumed = last_scrub;
+			continue;
+		} else {
+			++n_start;
+			sp[i].scrub_args.start = 0ll;
+			sp[i].resumed = NULL;
+		}
+		sp[i].skip = 0;
+		sp[i].scrub_args.end = (u64)-1ll;
+		sp[i].scrub_args.flags = readonly ? BTRFS_SCRUB_READONLY : 0;
+	}
+
+	if (!n_start && !n_resume) {
+		if (!do_quiet)
+			printf("scrub: nothing to resume for %s, fsid %s\n",
+			       path, fsid);
+		err = 0;
+		goto out;
+	}
+
+	ret = prg_fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	while (ret != -1) {
+		_scrub_datafile(SCRUB_PROGRESS_SOCKET_PATH, fsid,
+				NULL, sock_path, sizeof(sock_path));
+		/* ignore EOVERFLOW, as strncpy follows anyway */
+		strncpy(addr.sun_path, sock_path,
+			sizeof(addr.sun_path)-1);
+		ret = bind(prg_fd, (struct sockaddr *)&addr, sizeof(addr));
+		if (ret != -1 || errno != EADDRINUSE)
+			break;
+		ret = connect(prg_fd, (struct sockaddr *)&addr, sizeof(addr));
+		if (!ret || errno != ECONNREFUSED) {
+			fprintf(stderr, "ERROR: scrub already running\n");
+			close(prg_fd);
+			goto out;
+		}
+		ret = unlink(sock_path);
+	}
+	if (ret != -1) {
+		ret = listen(prg_fd, 100);
+	}
+	if (ret == -1) {
+		err(!do_quiet, "WARNING: failed to open the progress status "
+		    "socket at %s: %s. Progress cannot be queried\n",
+		    sock_path[0] ? sock_path : SCRUB_PROGRESS_SOCKET_PATH,
+		    strerror(errno));
+		if (prg_fd != -1) {
+			close(prg_fd);
+			prg_fd = -1;
+			if (sock_path[0])
+				unlink(sock_path);
+		}
+	}
+
+	if (do_record) {
+		/* write all-zero progress file for a start */
+		ret = scrub_write_progress(&spc_write_mutex, fsid, sp,
+					   fi_args.num_devices);
+		if (ret) {
+			err(!do_quiet, "WARNING: failed to write the progress "
+			    "status file: %s. Status recording disabled\n",
+			    strerror(-ret));
+			do_record = 0;
+		}
+	}
+
+	if (do_background) {
+		pid = fork();
+		if (pid == -1) {
+			err(!do_quiet, "ERROR: cannot scrub, fork failed: "
+			               "%s\n", strerror(errno));
+			err = 1;
+			goto out;
+		}
+
+		if (pid) {
+			int stat;
+			scrub_handle_sigint_parent();
+			if (!do_quiet)
+				printf("scrub %s on %s, fsid %s (pid=%d)\n",
+				       n_start ? "started" : "resumed",
+				       path, fsid, pid);
+			if (!do_wait) {
+				err = 0;
+				goto out;
+			}
+			ret = wait(&stat);
+			if (ret != pid) {
+				err(!do_quiet, "ERROR: wait failed: (ret=%d) "
+				    "%s\n", ret, strerror(errno));
+				err = 1;
+				goto out;
+			}
+			if (!WIFEXITED(stat) || WEXITSTATUS(stat)) {
+				err(!do_quiet, "ERROR: scrub process failed\n");
+				err = WIFEXITED(stat) ? WEXITSTATUS(stat) : -1;
+				goto out;
+			}
+			err = 0;
+			goto out;
+		}
+	}
+
+	scrub_handle_sigint_child(fdmnt);
+
+	for (i = 0; i < fi_args.num_devices; ++i) {
+		if (sp[i].skip) {
+			sp[i].scrub_args.progress = sp[i].resumed->p;
+			sp[i].stats = sp[i].resumed->stats;
+			sp[i].ret = 0;
+			sp[i].stats.finished = 1;
+			continue;
+		}
+		devid = di_args[i].devid;
+		gettimeofday(&tv, NULL);
+		sp[i].stats.t_start = tv.tv_sec;
+		ret = pthread_create(&t_devs[i], &t_attr, scrub_one_dev,&sp[i]);
+		if (ret) {
+			if (do_print)
+				fprintf(stderr, "ERROR: creating "
+				        "scrub_one_dev[%llu] thread failed: "
+				        "%s\n", devid, strerror(ret));
+			err = 1;
+			goto out;
+		}
+	}
+
+	spc.fdmnt = fdmnt;
+	spc.prg_fd = prg_fd;
+	spc.do_record = do_record;
+	spc.write_mutex = &spc_write_mutex;
+	spc.shared_progress = sp;
+	spc.fi = &fi_args;
+	pthread_create(&t_prog, &t_attr, scrub_progress_cycle, &spc);
+
+	err = 0;
+	for (i = 0; i < fi_args.num_devices; ++i) {
+		if (sp[i].skip)
+			continue;
+		devid = di_args[i].devid;
+		ret = pthread_join(t_devs[i], NULL);
+		if (ret) {
+			if (do_print)
+				fprintf(stderr, "ERROR: pthread_join failed "
+				        "for scrub_one_dev[%llu]: %s\n", devid,
+			                strerror(ret));
+			err++;
+			continue;
+		}
+		if (sp[i].ret && sp[i].ioctl_errno == ENODEV) {
+			if (do_print)
+				fprintf(stderr, "WARNING: device %lld not "
+				        "present\n", devid);
+			continue;
+		}
+		if (sp[i].ret && sp[i].ioctl_errno == ECANCELED) {
+			err++;
+		} else if (sp[i].ret) {
+			if (do_print)
+				fprintf(stderr, "ERROR: scrubbing %s failed "
+				        "for device id %lld (%s)\n", path,
+				        devid, strerror(sp[i].ioctl_errno));
+			err++;
+			continue;
+		}
+	}
+
+	if (do_print) {
+		const char *append = "done";
+		if (!do_stats_per_dev)
+			init_fs_stat(&fs_stat);
+		for (i = 0; i < fi_args.num_devices; ++i) {
+			if (do_stats_per_dev) {
+				print_scrub_dev(&di_args[i],
+				                &sp[i].scrub_args.progress,
+				                print_raw,
+				                sp[i].ret ? "canceled" : "done",
+				                &sp[i].stats);
+			} else {
+				if (sp[i].ret)
+					append = "canceled";
+				add_to_fs_stat(&sp[i].scrub_args.progress,
+						&sp[i].stats, &fs_stat);
+			}
+		}
+		if (!do_stats_per_dev) {
+			printf("scrub %s for %s\n", append, fsid);
+			print_fs_stat(&fs_stat, print_raw);
+		}
+	}
+
+	pthread_cancel(t_prog);
+	ret = pthread_join(t_prog, &terr);
+	if (do_print && terr && terr != PTHREAD_CANCELED) {
+		fprintf(stderr, "ERROR: recording progress "
+			"failed: %s\n", strerror(-PTR_ERR(terr)));
+	}
+
+	if (do_record) {
+		ret = scrub_write_progress(&spc_write_mutex, fsid, sp,
+					   fi_args.num_devices);
+		if (ret && do_print) {
+			fprintf(stderr, "ERROR: failed to record the result: "
+				"%s\n", strerror(-ret));
+		}
+	}
+
+	scrub_handle_sigint_child(-1);
+
+out:
+	free_history(past_scrubs);
+	free(di_args);
+	free(t_devs);
+	free(sp);
+	free(spc.progress);
+	if (prg_fd > -1) {
+		close(prg_fd);
+		if (sock_path[0])
+			unlink(sock_path);
+	}
+	close(fdmnt);
+
+	return !!err;
+}
+
+int do_scrub_start(int argc, char **argv)
+{
+	return scrub_start(argc, argv, 0);
+}
+
+int do_scrub_resume(int argc, char **argv)
+{
+	return scrub_start(argc, argv, 1);
+}
+
+int do_scrub_cancel(int argc, char **argv)
+{
+	char *path = argv[1];
+	int ret;
+	int fdmnt;
+	int err;
+	char mp[BTRFS_PATH_NAME_MAX+1];
+	struct btrfs_fs_devices* fs_devices_mnt = NULL;
+
+	fdmnt = open_file_or_dir(path);
+	if (fdmnt < 0) {
+		fprintf(stderr, "ERROR: scrub cancel failed\n");
+		return 12;
+	}
+
+again:
+	ret = ioctl(fdmnt, BTRFS_IOC_SCRUB_CANCEL, NULL);
+	err = errno;
+	close(fdmnt);
+
+	if (ret && err == EINVAL) {
+		/* path is no mounted btrfs. try if it's a device */
+		ret = check_mounted_where(fdmnt, path, mp, sizeof(mp),
+					  &fs_devices_mnt);
+		close(fdmnt);
+		if (ret) {
+			fdmnt = open_file_or_dir(mp);
+			if (fdmnt >= 0) {
+				path = mp;
+				goto again;
+			}
+		}
+	}
+
+	if (ret) {
+		fprintf(stderr, "ERROR: scrub cancel failed on %s: %s\n", path,
+		        err == ENOTCONN ? "not running" : strerror(errno));
+		return 1;
+	}
+
+	printf("scrub cancelled\n");
+
+	return 0;
+}
+
+int do_scrub_status(int argc, char **argv)
+{
+
+	char *path;
+	struct btrfs_ioctl_fs_info_args fi_args;
+	struct btrfs_ioctl_dev_info_args *di_args = NULL;
+	struct scrub_file_record **past_scrubs = NULL;
+	struct scrub_file_record *last_scrub;
+	struct scrub_fs_stat fs_stat;
+	struct sockaddr_un addr = {
+		.sun_family = AF_UNIX,
+	};
+	int ret;
+	int fdmnt;
+	int i;
+	optind = 1;
+	int print_raw = 0;
+	int do_stats_per_dev = 0;
+	char c;
+	char fsid[37];
+	int fdres = -1;
+	int err = 0;
+
+	while ((c = getopt(argc, argv, "dR")) != -1) {
+		switch(c) {
+		case 'd':
+			do_stats_per_dev = 1;
+			break;
+		case 'R':
+			print_raw = 1;
+			break;
+		case '?':
+		default:
+			fprintf(stderr, "ERROR: scrub status args invalid.\n"
+			                " -d  stats per device\n");
+			return 1;
+		}
+	}
+
+	path = argv[optind];
+
+	fdmnt = open_file_or_dir(path);
+	if (fdmnt < 0) {
+		fprintf(stderr, "ERROR: can't access to '%s'\n", path);
+		return 12;
+	}
+
+	ret = scrub_fs_info(fdmnt, path, &fi_args, &di_args);
+	if (ret) {
+		fprintf(stderr, "ERROR: getting dev info for scrub failed: "
+		        "%s\n", strerror(-ret));
+		err = 1;
+		goto out;
+	}
+	if (!fi_args.num_devices) {
+		fprintf(stderr, "ERROR: no devices found\n");
+		err = 1;
+		goto out;
+	}
+
+	uuid_unparse(fi_args.fsid, fsid);
+
+	fdres = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (fdres == -1) {
+		fprintf(stderr, "ERROR: failed to create socket to "
+			"receive progress information: %s\n",
+			strerror(errno));
+		err = 1;
+		goto out;
+	}
+	_scrub_datafile(SCRUB_PROGRESS_SOCKET_PATH, fsid,
+			NULL, addr.sun_path, sizeof(addr.sun_path)-1);
+	/* ignore EOVERFLOW, just use shorter name and hope for the best */
+	ret = connect(fdres, (struct sockaddr *)&addr, sizeof(addr));
+	if (ret == -1) {
+		fdres = scrub_open_file_r(SCRUB_DATA_FILE, fsid);
+		if (fdres < 0 && fdres != -ENOENT) {
+			fprintf(stderr, "WARNING: failed to open status file: "
+				"%s\n", strerror(-fdres));
+			err = 1;
+			goto out;
+		}
+	}
+
+	if (fdres >= 0) {
+		past_scrubs = scrub_read_file(fdres, 1);
+		if (IS_ERR(past_scrubs))
+			fprintf(stderr, "WARNING: failed to read status: %s\n",
+				strerror(-PTR_ERR(past_scrubs)));
+	}
+
+	printf("scrub status for %s\n", fsid);
+
+	/*
+	 * TODO: rather communicate with scrub process instead of
+	 *       dumping the file stats for instant results
+	 */
+	if (do_stats_per_dev) {
+		for (i = 0; i < fi_args.num_devices; ++i) {
+			last_scrub = last_dev_scrub(past_scrubs,
+			                            di_args[i].devid);
+			if (!last_scrub) {
+				print_scrub_dev(&di_args[i], NULL, print_raw,
+				                NULL, NULL);
+				continue;
+			}
+			print_scrub_dev(&di_args[i], &last_scrub->p, print_raw,
+				        last_scrub->stats.finished ?
+			                "history" : "status",
+			                &last_scrub->stats);
+		}
+	} else {
+		init_fs_stat(&fs_stat);
+		for (i = 0; i < fi_args.num_devices; ++i) {
+			last_scrub = last_dev_scrub(past_scrubs,
+			                            di_args[i].devid);
+			if (!last_scrub)
+				continue;
+			add_to_fs_stat(&last_scrub->p, &last_scrub->stats,
+			               &fs_stat);
+		}
+		print_fs_stat(&fs_stat, print_raw);
+	}
+
+out:
+	free_history(past_scrubs);
+	free(di_args);
+	close(fdmnt);
+	if (fdres > -1)
+		close(fdres);
+
+	return err;
+}