diff mbox

[RFC] big fat transaction ioctl

Message ID Pine.LNX.4.64.0911101143120.31818@cobra.newdream.net (mailing list archive)
State Under Review, archived
Headers show

Commit Message

Sage Weil Nov. 10, 2009, 8:12 p.m. UTC
None
diff mbox

Patch

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 136c5ed..4269616 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -37,6 +37,7 @@ 
 #include <linux/compat.h>
 #include <linux/bit_spinlock.h>
 #include <linux/security.h>
+#include <linux/syscalls.h>
 #include <linux/xattr.h>
 #include <linux/vmalloc.h>
 #include "compat.h"
@@ -1303,6 +1304,190 @@  long btrfs_ioctl_trans_end(struct file *file)
 	return 0;
 }
 
+/*
+ * Run @ut's ops in order, storing each result in the op's user-visible rval.
+ * Returns 0 or -errno, and the number of successfully completed ops via
+ * @ops_completed (where success/failure is defined by the _FAIL_* flags).
+ */
+static long do_usertrans(struct btrfs_root *root,
+			 struct btrfs_ioctl_usertrans *ut,
+			 u64 *ops_completed)
+{
+	int i, err, fd1, fd2;
+	int *fds;
+	struct file *file;
+	struct btrfs_ioctl_usertrans_op __user *ops;
+
+	ops = (void __user *)(unsigned long)ut->ops_ptr;
+	fds = kcalloc(ut->num_fds, sizeof(int), GFP_KERNEL);
+	if (!fds)
+		return -ENOMEM;
+
+	for (i = 0; i < ut->num_ops; i++) {
+		struct btrfs_ioctl_usertrans_op op;
+		int ret = -EINVAL;	/* rval for ops not handled below */
+
+		err = -EFAULT;
+		if (copy_from_user(&op, &ops[i], sizeof(op)))
+			goto out;
+
+		/* translate fd table indices (args[0], args[1]) into real fds */
+		err = -EINVAL;
+		switch (op.op) {
+		case BTRFS_IOC_UT_OP_CLONERANGE:
+			if (op.args[1] < 0 || op.args[1] >= ut->num_fds)
+				goto out;
+			fd2 = fds[op.args[1]];
+			/* fall through: clonerange also takes an fd in args[0] */
+		case BTRFS_IOC_UT_OP_CLOSE:
+		case BTRFS_IOC_UT_OP_PWRITE:
+			if (op.args[0] < 0 || op.args[0] >= ut->num_fds)
+				goto out;
+			fd1 = fds[op.args[0]];
+		}
+
+		/* do op */
+		switch (op.op) {
+		case BTRFS_IOC_UT_OP_OPEN:
+			ret = -EINVAL;
+			if (op.args[3] < 0 || op.args[3] >= ut->num_fds)
+				goto out;
+			ret = sys_open((const char __user *)op.args[0],
+				       op.args[1], op.args[2]);
+			fds[op.args[3]] = ret;
+			break;
+		case BTRFS_IOC_UT_OP_CLOSE:
+			ret = sys_close(fd1);
+			break;
+		case BTRFS_IOC_UT_OP_PWRITE:
+			ret = sys_pwrite64(fd1, (const char __user *)op.args[1],
+					   op.args[2], op.args[3]);
+			break;
+		case BTRFS_IOC_UT_OP_UNLINK:
+			ret = sys_unlink((const char __user *)op.args[0]);
+			break;
+		case BTRFS_IOC_UT_OP_MKDIR:
+			ret = sys_mkdir((const char __user *)op.args[0],
+				op.args[1]);
+			break;
+		case BTRFS_IOC_UT_OP_RMDIR:
+			ret = sys_rmdir((const char __user *)op.args[0]);
+			break;
+		case BTRFS_IOC_UT_OP_TRUNCATE:
+			ret = sys_truncate((const char __user *)op.args[0],
+					   op.args[1]);
+			break;
+		case BTRFS_IOC_UT_OP_SETXATTR:
+			ret = sys_setxattr((char __user *)op.args[0],
+					   (char __user *)op.args[1],
+					   (void __user *)op.args[2],
+					   op.args[3], op.args[4]);
+			break;
+		case BTRFS_IOC_UT_OP_REMOVEXATTR:
+			ret = sys_removexattr((char __user *)op.args[0],
+					      (char __user *)op.args[1]);
+			break;
+		case BTRFS_IOC_UT_OP_CLONERANGE:
+			ret = -EBADF;
+			file = fget(fd1);
+			if (file) {
+				ret = btrfs_ioctl_clone(file, fd2,
+							op.args[2], op.args[3],
+							op.args[4]);
+				fput(file);
+			}
+			break;
+		}
+		pr_debug(" ut %d/%d op %d args %llx %llx %llx %llx %llx = %d\n",
+			 i, (int)ut->num_ops, (int)op.op, op.args[0],
+			 op.args[1], op.args[2], op.args[3], op.args[4], ret);
+
+		put_user(ret, &ops[i].rval);
+		/* stop with -EINVAL (set above) if the result counts as failure */
+		if ((op.flags & BTRFS_IOC_UT_OP_FLAG_FAIL_ON_NE) &&
+		    ret != op.rval)
+			goto out;
+		if ((op.flags & BTRFS_IOC_UT_OP_FLAG_FAIL_ON_EQ) &&
+		    ret == op.rval)
+			goto out;
+		if ((op.flags & BTRFS_IOC_UT_OP_FLAG_FAIL_ON_LT) &&
+		    ret < op.rval)
+			goto out;
+		if ((op.flags & BTRFS_IOC_UT_OP_FLAG_FAIL_ON_GT) &&
+		    ret > op.rval)
+			goto out;
+		if ((op.flags & BTRFS_IOC_UT_OP_FLAG_FAIL_ON_LTE) &&
+		    ret <= op.rval)
+			goto out;
+		if ((op.flags & BTRFS_IOC_UT_OP_FLAG_FAIL_ON_GTE) &&
+		    ret >= op.rval)
+			goto out;
+	}
+	err = 0;
+out:
+	*ops_completed = i;
+	kfree(fds);
+	return err;
+}
+
+/* BTRFS_IOC_USERTRANS: run a batch of ops inside one ioctl transaction. */
+long btrfs_ioctl_usertrans(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_ioctl_usertrans ut;
+	struct btrfs_ioctl_usertrans __user *orig_ut = arg;
+	u64 ops_completed = 0;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&ut, orig_ut, sizeof(ut)))
+		return -EFAULT;
+
+	ret = mnt_want_write(file->f_path.mnt);
+	if (ret)
+		return ret;
+
+	ret = btrfs_reserve_metadata_space(root, 5*ut.num_ops);
+	if (ret)
+		goto out_drop_write;
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	root->fs_info->open_ioctl_trans++;
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	ret = -ENOMEM;
+	trans = btrfs_start_ioctl_transaction(root, 0);
+	if (!trans)
+		goto out_drop;
+
+	ret = do_usertrans(root, &ut, &ops_completed);
+	put_user(ops_completed, &orig_ut->ops_completed);
+
+	/* wedging: never end the transaction, so no partial result commits */
+	if (ret < 0 && (ut.flags & BTRFS_IOC_UT_FLAG_WEDGEONFAIL))
+		pr_err("btrfs: usertrans failed, wedging to avoid partial "
+		       "commit\n");
+	else
+		btrfs_end_transaction(trans, root);
+
+out_drop:
+	mutex_lock(&root->fs_info->trans_mutex);
+	root->fs_info->open_ioctl_trans--;
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	btrfs_unreserve_metadata_space(root, 5*ut.num_ops);
+out_drop_write:
+	mnt_drop_write(file->f_path.mnt);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -1343,6 +1528,8 @@  long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_SYNC:
 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
 		return 0;
+	case BTRFS_IOC_USERTRANS:
+		return btrfs_ioctl_usertrans(file, argp);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index bc49914..f94e293 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -67,4 +67,53 @@  struct btrfs_ioctl_clone_range_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
 				struct btrfs_ioctl_vol_args)
+
+/* usertrans ops; the trailing comment lists each op's args[] in order */
+/* the 'fd' values are _indices_ into a temporary fd table, see num_fds below */
+#define BTRFS_IOC_UT_OP_OPEN         1  /* path, flags, mode, fd */
+#define BTRFS_IOC_UT_OP_CLOSE        2  /* fd */
+#define BTRFS_IOC_UT_OP_PWRITE       3  /* fd, data, length, offset */
+#define BTRFS_IOC_UT_OP_UNLINK       4  /* path */
+#define BTRFS_IOC_UT_OP_LINK         5  /* oldpath, newpath (no handler yet) */
+#define BTRFS_IOC_UT_OP_MKDIR        6  /* path, mode */
+#define BTRFS_IOC_UT_OP_RMDIR        7  /* path */
+#define BTRFS_IOC_UT_OP_TRUNCATE     8  /* path, size */
+#define BTRFS_IOC_UT_OP_SETXATTR     9  /* path, name, data, len, flags */
+#define BTRFS_IOC_UT_OP_REMOVEXATTR 10  /* path, name */
+#define BTRFS_IOC_UT_OP_CLONERANGE  11  /* dst fd, src fd, off, len, dst off */
+
+/* an op 'fails' if its return value compares this way against the op's rval */
+#define BTRFS_IOC_UT_OP_FLAG_FAIL_ON_NE    (1<< 1)
+#define BTRFS_IOC_UT_OP_FLAG_FAIL_ON_EQ    (1<< 2)
+#define BTRFS_IOC_UT_OP_FLAG_FAIL_ON_LT    (1<< 3)
+#define BTRFS_IOC_UT_OP_FLAG_FAIL_ON_GT    (1<< 4)
+#define BTRFS_IOC_UT_OP_FLAG_FAIL_ON_LTE   (1<< 5)
+#define BTRFS_IOC_UT_OP_FLAG_FAIL_ON_GTE   (1<< 6)
+
+struct btrfs_ioctl_usertrans_op {
+	__u64 op;       /* in: one of BTRFS_IOC_UT_OP_* */
+	__s64 args[5];  /* in: per-op arguments (pointers, sizes, fd indices) */
+	__s64 rval;     /* in: value the FAIL_ON_* flags compare to; out: result */
+	__u64 flags;    /* in: BTRFS_IOC_UT_OP_FLAG_FAIL_ON_* bits */
+};
+
+/*
+ * If an op fails and we cannot complete the transaction, we may want
+ * to lock up the file system (requiring a reboot) to prevent a
+ * partial result from committing.
+ */
+#define BTRFS_IOC_UT_FLAG_WEDGEONFAIL (1<<13)
+
+struct btrfs_ioctl_usertrans {
+	__u64 num_ops;                  /* in: # ops */
+	__u64 ops_ptr;                  /* in: usertrans_op array */
+	__u64 num_fds;	                /* in: size of fd table (max fd + 1) */
+	__u64 data_bytes, metadata_ops; /* in: for space reservation */
+	__u64 flags;                    /* in: BTRFS_IOC_UT_FLAG_* */
+	__u64 ops_completed;            /* out: # ops completed */
+};
+/* NOTE(review): ops_completed is written back to user; confirm _IOW vs _IOWR */
+#define BTRFS_IOC_USERTRANS  _IOW(BTRFS_IOCTL_MAGIC, 16,	\
+				  struct btrfs_ioctl_usertrans)
+
 #endif
diff --git a/fs/namei.c b/fs/namei.c
index d11f404..4d53225 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2148,6 +2148,7 @@  SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
 {
 	return sys_mkdirat(AT_FDCWD, pathname, mode);
 }
+EXPORT_SYMBOL(sys_mkdir);
 
 /*
  * We try to drop the dentry early: we should have
@@ -2262,6 +2263,7 @@  SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
 {
 	return do_rmdir(AT_FDCWD, pathname);
 }
+EXPORT_SYMBOL(sys_rmdir);
 
 int vfs_unlink(struct inode *dir, struct dentry *dentry)
 {
@@ -2369,6 +2371,7 @@  SYSCALL_DEFINE1(unlink, const char __user *, pathname)
 {
 	return do_unlinkat(AT_FDCWD, pathname);
 }
+EXPORT_SYMBOL(sys_unlink);
 
 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 {
diff --git a/fs/open.c b/fs/open.c
index 4f01e06..15eddfc 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -294,6 +294,7 @@  SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
 {
 	return do_sys_truncate(path, length);
 }
+EXPORT_SYMBOL(sys_truncate);
 
 static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 {
@@ -1062,6 +1063,7 @@  SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
 	asmlinkage_protect(3, ret, filename, flags, mode);
 	return ret;
 }
+EXPORT_SYMBOL(sys_open);
 
 SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
 		int, mode)
diff --git a/fs/read_write.c b/fs/read_write.c
index 3ac2898..75e9f60 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -453,6 +453,8 @@  SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
 
 	return ret;
 }
+EXPORT_SYMBOL(sys_pwrite64);
+
 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
 {
diff --git a/fs/xattr.c b/fs/xattr.c
index 6d4f6d3..488c889 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -294,6 +294,7 @@  SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
 	path_put(&path);
 	return error;
 }
+EXPORT_SYMBOL(sys_setxattr);
 
 SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
 		const char __user *, name, const void __user *, value,
@@ -523,6 +524,7 @@  SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
 	path_put(&path);
 	return error;
 }
+EXPORT_SYMBOL(sys_removexattr);
 
 SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
 		const char __user *, name)