@@ -105,58 +105,71 @@ struct userfaultfd_unmap_ctx {
struct list_head list;
};
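+/*
+ * Carries a wakeup request and, optionally, an asynchronous copy handed
+ * over to the woken thread:
+ *
+ * @mode: UFFDIO_WRITE_MODE_* flags for the copy
+ * @iocb_callback: iocb to complete once the copy is done; NULL when the
+ * wakeup carries no remote copy
+ * @from: duplicated iterator holding the source data of the copy
+ * @start: start of the range to wake (len == 0 means wake all)
+ * @len: length of the range to wake
+ * @copied: set by userfaultfd_wake_function() once a blocked thread has
+ * taken over the copy
+ */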
+struct userfaultfd_wake_info {
+ __u64 mode;
+ struct kiocb *iocb_callback;
+ struct iov_iter from;
+ unsigned long start;
+ unsigned long len;
+ bool copied;
+};
+
struct userfaultfd_wait_queue {
struct uffd_msg msg;
wait_queue_entry_t wq;
struct userfaultfd_ctx *ctx;
+ struct userfaultfd_wake_info wake_info;
bool waken;
};
-struct userfaultfd_wake_range {
- unsigned long start;
- unsigned long len;
-};
+
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
- struct userfaultfd_wake_range *range = key;
- int ret;
+ struct userfaultfd_wake_info *wake_info = key;
struct userfaultfd_wait_queue *uwq;
unsigned long start, len;
+ int ret = 0;
uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
- ret = 0;
/* len == 0 means wake all */
- start = range->start;
- len = range->len;
+ start = wake_info->start;
+ len = wake_info->len;
if (len && (start > uwq->msg.arg.pagefault.address ||
start + len <= uwq->msg.arg.pagefault.address))
goto out;
- smp_store_mb(uwq->waken, true);
+ uwq->wake_info = *wake_info;
+
+ if (wake_info->iocb_callback)
+ wake_info->copied = true;
+
+ /*
+ * Ensure uwq->wake_info is visible to handle_userfault() before
+ * uwq->waken is set; pairs with the smp_rmb() there.
+ */
+ smp_wmb();
+
+ WRITE_ONCE(uwq->waken, true);
/*
* The Program-Order guarantees provided by the scheduler
* ensure uwq->waken is visible before the task is woken.
*/
ret = wake_up_state(wq->private, mode);
- if (ret) {
- /*
- * Wake only once, autoremove behavior.
- *
- * After the effect of list_del_init is visible to the other
- * CPUs, the waitqueue may disappear from under us, see the
- * !list_empty_careful() in handle_userfault().
- *
- * try_to_wake_up() has an implicit smp_mb(), and the
- * wq->private is read before calling the extern function
- * "wake_up_state" (which in turns calls try_to_wake_up).
- */
- list_del_init(&wq->entry);
- }
+
+ /*
+ * Wake only once, autoremove behavior.
+ *
+ * After the effect of list_del_init is visible to the other
+ * CPUs, the waitqueue may disappear from under us, see the
+ * !list_empty_careful() in handle_userfault().
+ *
+ * try_to_wake_up() has an implicit smp_mb(), and the
+ * wq->private is read before calling the extern function
+ * "wake_up_state" (which in turns calls try_to_wake_up).
+ */
+ list_del_init(&wq->entry);
out:
- return ret;
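+ /* a nonzero return also stops the exclusive-waiter scan once the copy was handed off */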
+ return ret || wake_info->copied;
}
/**
@@ -384,6 +397,9 @@ static bool userfaultfd_get_async_complete_locked(struct userfaultfd_ctx *ctx,
return true;
}
+static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wake_info *wake_info);
+
static bool userfaultfd_get_async_complete(struct userfaultfd_ctx *ctx,
struct kiocb **iocb, struct iov_iter *iter)
{
@@ -414,6 +430,43 @@ static void userfaultfd_copy_async_msg(struct kiocb *iocb,
iter->kvec = NULL;
}
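+
+/*
+ * Runs in the context of the woken faulting thread: perform the copy or
+ * zeropage that was handed over via uwq->wake_info, wake the remaining
+ * waiters in the range and finally complete the originating write's iocb.
+ */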
+static void userfaultfd_complete_write(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wait_queue *uwq)
+{
+ struct kiocb *iocb = uwq->wake_info.iocb_callback;
+ const struct kvec *kvec = uwq->wake_info.from.kvec;
+ bool zeropage = uwq->wake_info.mode & UFFDIO_WRITE_MODE_ZEROPAGE;
+ u64 mode = uwq->wake_info.mode &
+ (UFFDIO_WRITE_MODE_DONTWAKE | UFFDIO_WRITE_MODE_WP);
+ int r;
+
+ if (zeropage)
+ r = mfill_zeropage(ctx->mm, uwq->wake_info.start,
+ &uwq->wake_info.from, &ctx->mmap_changing);
+ else
+ r = mcopy_atomic(ctx->mm, uwq->wake_info.start,
+ &uwq->wake_info.from, &ctx->mmap_changing, mode);
+
+ /*
+ * On failure do not wake the others; after a partial write the
+ * waiters are still woken for the range.
+ */
+ if (r < 0)
+ goto out;
+
+ /* Clear the callback and source: the waiters woken below must not copy again */
+ uwq->wake_info.iocb_callback = NULL;
+ uwq->wake_info.from.kvec = NULL;
+ wake_userfault(ctx, &uwq->wake_info);
+out:
+ /*
+ * Complete the operation only after waking the other threads, as is
+ * done in the synchronous path.
+ */
+ iocb->ki_complete(iocb, r, 0);
+ kfree(kvec);
+}
+
/*
* The locking rules involved in returning VM_FAULT_RETRY depending on
* FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
@@ -548,6 +601,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
ctx->features);
uwq.ctx = ctx;
uwq.waken = false;
+ uwq.wake_info.iocb_callback = NULL;
blocking_state = userfaultfd_get_blocking_state(vmf->flags);
@@ -569,7 +623,11 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
*/
spin_lock(&wqh->lock);
- __add_wait_queue(wqh, &uwq.wq);
+ /*
+ * Exclusive on the fault_wqh (not on the fault_pending_wqh) so
+ * that a remote copy can wake exactly one async waiter.
+ */
+ if (async)
+ __add_wait_queue_exclusive(wqh, &uwq.wq);
+ else
+ __add_wait_queue(wqh, &uwq.wq);
/* Ensure it is queued before userspace is informed. */
smp_wmb();
@@ -612,6 +670,11 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
cpu_relax();
cond_resched();
}
+ /*
+ * Ensure the writes from userfaultfd_wake_function() into
+ * uwq are visible; pairs with the smp_wmb() issued there
+ * before uwq->waken is set.
+ */
+ smp_rmb();
} else
schedule();
}
@@ -650,6 +713,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
local_irq_enable();
}
+ /* Complete copy/zero after the entry is no longer on the queue. */
+ if (uwq.wake_info.iocb_callback)
+ userfaultfd_complete_write(ctx, &uwq);
+
/*
* ctx may go away after this if the userfault pseudo fd is
* already released.
@@ -1004,7 +1071,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
struct mm_struct *mm = ctx->mm;
struct vm_area_struct *vma, *prev;
/* len == 0 means wake all */
- struct userfaultfd_wake_range range = { .len = 0, };
+ struct userfaultfd_wake_info wake_info = { 0 };
unsigned long new_flags;
WRITE_ONCE(ctx->released, true);
@@ -1052,8 +1119,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
* the fault_*wqh.
*/
spin_lock_irq(&ctx->fault_pending_wqh.lock);
- __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
- __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &wake_info);
+ __wake_up(&ctx->fault_wqh, TASK_NORMAL, 0, &wake_info);
spin_unlock_irq(&ctx->fault_pending_wqh.lock);
userfaultfd_cancel_async_reads(ctx);
@@ -1294,7 +1361,7 @@ static ssize_t userfaultfd_ctx_read(struct kiocb *iocb,
* anyway.
*/
list_del(&uwq->wq.entry);
- add_wait_queue(&ctx->fault_wqh, &uwq->wq);
+ add_wait_queue_exclusive(&ctx->fault_wqh, &uwq->wq);
write_seqcount_end(&ctx->refile_seq);
@@ -1459,20 +1526,20 @@ static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
static void __wake_userfault(struct userfaultfd_ctx *ctx,
- struct userfaultfd_wake_range *range)
+ struct userfaultfd_wake_info *wake_info)
{
spin_lock_irq(&ctx->fault_pending_wqh.lock);
/* wake all in the range and autoremove */
if (waitqueue_active(&ctx->fault_pending_wqh))
__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
- range);
+ wake_info);
if (waitqueue_active(&ctx->fault_wqh))
- __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
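+ /* nr_exclusive == 0: wake every (now exclusive) fault_wqh waiter */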
+ __wake_up(&ctx->fault_wqh, TASK_NORMAL, 0, wake_info);
spin_unlock_irq(&ctx->fault_pending_wqh.lock);
}
static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
- struct userfaultfd_wake_range *range)
+ struct userfaultfd_wake_info *wake_info)
{
unsigned seq;
bool need_wakeup;
@@ -1499,7 +1566,7 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
cond_resched();
} while (read_seqcount_retry(&ctx->refile_seq, seq));
if (need_wakeup)
- __wake_userfault(ctx, range);
+ __wake_userfault(ctx, wake_info);
}
static __always_inline int validate_range(struct mm_struct *mm,
@@ -1524,14 +1591,57 @@ static __always_inline int validate_range(struct mm_struct *mm,
return 0;
}
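+
+/*
+ * Try to hand the copy off to a thread that is already blocked in
+ * handle_userfault(): duplicate the source iterator and wake exactly one
+ * exclusive fault_wqh waiter in the range, which then performs the copy
+ * against its own mm in userfaultfd_complete_write(). Returns
+ * -EIOCBQUEUED once a waiter has taken ownership of the iocb, or -EAGAIN
+ * if the caller should fall back to a local copy.
+ */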
+static int userfaultfd_remote_mcopy(struct kiocb *iocb, __u64 dst,
+ struct iov_iter *from, __u64 mode)
+{
+ struct file *file = iocb->ki_filp;
+ struct userfaultfd_ctx *ctx = file->private_data;
+ struct userfaultfd_wake_info wake_info = {
+ .iocb_callback = iocb,
+ .mode = mode,
+ .start = dst,
+ .len = iov_iter_count(from),
+ .copied = false,
+ };
+ int ret = -EAGAIN;
+
+ if (mode & UFFDIO_COPY_MODE_DONTWAKE)
+ goto out;
+
+ if (!iov_iter_is_bvec(from) && !iov_iter_is_kvec(from))
+ goto out;
+
+ /*
+ * Check without a lock. If we are mistaken, the mcopy would be
+ * performed locally.
+ */
+ if (!waitqueue_active(&ctx->fault_wqh))
+ goto out;
+
+ /* on allocation failure fall back to a local copy */
+ if (!dup_iter(&wake_info.from, from, GFP_KERNEL))
+ goto out;
+
+ /* wake one in the range and autoremove */
+ __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &wake_info);
+
+ if (!wake_info.copied) {
+ kfree(wake_info.from.kvec);
+ goto out;
+ }
+
+ ret = -EIOCBQUEUED;
+out:
+ return ret;
+}
+
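+/*
+ * Illustrative only: with libaio, userspace would open the userfaultfd
+ * with O_NONBLOCK and submit the page contents as an asynchronous write,
+ * packing the page-aligned destination and the UFFDIO_WRITE_MODE_* flags
+ * into the file offset (dst_addr, page and aio_ctx are placeholders):
+ *
+ *	struct iocb cb, *cbs[1] = { &cb };
+ *	io_prep_pwrite(&cb, uffd, page, PAGE_SIZE,
+ *		       dst_addr | UFFDIO_WRITE_MODE_WP);
+ *	io_submit(aio_ctx, 1, cbs);
+ *
+ * The completion reported by io_getevents() carries the number of bytes
+ * copied or a negative error.
+ */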
ssize_t userfaultfd_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct userfaultfd_wake_range range;
+ struct userfaultfd_wake_info wake_info = { 0 };
struct userfaultfd_ctx *ctx = file->private_data;
size_t len = iov_iter_count(from);
__u64 dst = iocb->ki_pos & PAGE_MASK;
unsigned long mode = iocb->ki_pos & ~PAGE_MASK;
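+ /* the low, sub-page bits of ki_pos carry the UFFDIO_WRITE_MODE_* flags */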
+ int no_wait = file->f_flags & O_NONBLOCK;
bool zeropage;
__s64 ret;
@@ -1563,25 +1673,30 @@ ssize_t userfaultfd_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ret)
goto out;
- if (mmget_not_zero(ctx->mm)) {
+ if (!mmget_not_zero(ctx->mm))
+ return -ESRCH;
+
+ ret = -EAGAIN;
+ if (no_wait && !is_sync_kiocb(iocb))
+ ret = userfaultfd_remote_mcopy(iocb, dst, from, mode);
+ if (ret == -EAGAIN) {
if (zeropage)
ret = mfill_zeropage(ctx->mm, dst, from,
&ctx->mmap_changing);
else
ret = mcopy_atomic(ctx->mm, dst, from,
&ctx->mmap_changing, mode);
- mmput(ctx->mm);
- } else {
- return -ESRCH;
}
+ mmput(ctx->mm);
+
if (ret < 0)
goto out;
/* len == 0 would wake all */
- range.len = ret;
+ wake_info.len = ret;
if (!(mode & UFFDIO_COPY_MODE_DONTWAKE)) {
- range.start = dst;
- wake_userfault(ctx, &range);
+ wake_info.start = dst;
+ wake_userfault(ctx, &wake_info);
}
out:
return ret;
@@ -1916,7 +2031,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* permanently and it avoids userland to call
* UFFDIO_WAKE explicitly.
*/
- struct userfaultfd_wake_range range;
+ struct userfaultfd_wake_info range = { 0 };
range.start = start;
range.len = vma_end - start;
wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
@@ -1971,7 +2086,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
{
int ret;
struct uffdio_range uffdio_wake;
- struct userfaultfd_wake_range range;
+ struct userfaultfd_wake_info wake_info = { 0 };
const void __user *buf = (void __user *)arg;
ret = -EFAULT;
@@ -1982,16 +2097,16 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
if (ret)
goto out;
- range.start = uffdio_wake.start;
- range.len = uffdio_wake.len;
+ wake_info.start = uffdio_wake.start;
+ wake_info.len = uffdio_wake.len;
/*
* len == 0 means wake all and we don't want to wake all here,
* so check it again to be sure.
*/
- VM_BUG_ON(!range.len);
+ VM_BUG_ON(!wake_info.len);
- wake_userfault(ctx, &range);
+ wake_userfault(ctx, &wake_info);
ret = 0;
out:
@@ -2004,7 +2119,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
__s64 ret;
struct uffdio_copy uffdio_copy;
struct uffdio_copy __user *user_uffdio_copy;
- struct userfaultfd_wake_range range;
+ struct userfaultfd_wake_info wake_info = { 0 };
struct iov_iter iter;
struct iovec iov;
@@ -2052,12 +2167,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
goto out;
BUG_ON(!ret);
/* len == 0 would wake all */
- range.len = ret;
+ wake_info.len = ret;
if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
- range.start = uffdio_copy.dst;
- wake_userfault(ctx, &range);
+ wake_info.start = uffdio_copy.dst;
+ wake_userfault(ctx, &wake_info);
}
- ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
+ ret = wake_info.len == uffdio_copy.len ? 0 : -EAGAIN;
out:
return ret;
}
@@ -2068,7 +2183,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
__s64 ret;
struct uffdio_zeropage uffdio_zeropage;
struct uffdio_zeropage __user *user_uffdio_zeropage;
- struct userfaultfd_wake_range range;
+ struct userfaultfd_wake_info wake_info = { 0 };
struct iov_iter iter;
struct iovec iov;
@@ -2108,12 +2223,12 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
goto out;
/* len == 0 would wake all */
BUG_ON(!ret);
- range.len = ret;
+ wake_info.len = ret;
if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
- range.start = uffdio_zeropage.range.start;
- wake_userfault(ctx, &range);
+ wake_info.start = uffdio_zeropage.range.start;
+ wake_userfault(ctx, &wake_info);
}
- ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
+ ret = wake_info.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
out:
return ret;
}
@@ -2124,7 +2239,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
int ret;
struct uffdio_writeprotect uffdio_wp;
struct uffdio_writeprotect __user *user_uffdio_wp;
- struct userfaultfd_wake_range range;
+ struct userfaultfd_wake_info wake_info = { 0 };
bool mode_wp, mode_dontwake;
if (READ_ONCE(ctx->mmap_changing))
@@ -2158,9 +2273,9 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
return ret;
if (!mode_wp && !mode_dontwake) {
- range.start = uffdio_wp.range.start;
- range.len = uffdio_wp.range.len;
- wake_userfault(ctx, &range);
+ wake_info.start = uffdio_wp.range.start;
+ wake_info.len = uffdio_wp.range.len;
+ wake_userfault(ctx, &wake_info);
}
return ret;
}