Message ID: 20180522113108.25713-9-hch@lst.de (mailing list archive)
State: New, archived
On Tue, May 22, 2018 at 01:30:45PM +0200, Christoph Hellwig wrote:
> +static inline void __aio_poll_complete(struct poll_iocb *req, __poll_t mask)
> +{
> +	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
> +
> +	fput(req->file);
> +	aio_complete(iocb, mangle_poll(mask), 0);
> +}

Careful.

> +static int aio_poll_cancel(struct kiocb *iocb)
> +{
> +	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
> +	struct poll_iocb *req = &aiocb->poll;
> +	struct wait_queue_head *head = req->head;
> +	bool found = false;
> +
> +	spin_lock(&head->lock);
> +	found = __aio_poll_remove(req);
> +	spin_unlock(&head->lock);

What's to guarantee that req->head has not been freed by that point?
Look: wakeup finds ->ctx_lock held, so it leaves the sucker on the list,
removes it from the queue and schedules the call of __aio_poll_complete().
Which gets executed just as we hit aio_poll_cancel(), starting with fput().

You really want to do aio_complete() before fput().  That way you know
that req->wait is alive and well at least until the iocb gets removed
from the list.

> +	req->events = demangle_poll(iocb->aio_buf) | POLLERR | POLLHUP;

EPOLLERR | EPOLLHUP, please.  The values are equal to POLLERR and POLLHUP
on all architectures, but let's avoid misannotations.

> +	spin_lock_irq(&ctx->ctx_lock);
> +	list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
> +
> +	spin_lock(&req->head->lock);
> +	mask = req->file->f_op->poll_mask(req->file, req->events);
> +	if (!mask)
> +		__add_wait_queue(req->head, &req->wait);

ITYM

	if (!mask) {
		__add_wait_queue(req->head, &req->wait);
		list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
	}

What's the point of exposing it to aio_poll_cancel() when it has never
been on the waitqueue?
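Put concretely, the reordering suggested above would look something like
this (a sketch of the suggested change, not code from the posted patch):

static inline void __aio_poll_complete(struct poll_iocb *req, __poll_t mask)
{
	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);

	/*
	 * Complete first: aio_complete() removes the iocb from
	 * ->active_reqs under ->ctx_lock, so a concurrent cancel either
	 * finds the iocb with req->file (and hence req->head) still
	 * pinned, or does not find it at all.
	 */
	aio_complete(iocb, mangle_poll(mask), 0);
	fput(req->file);
}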
On Tue, May 22, 2018 at 11:05:24PM +0100, Al Viro wrote:
> > +{
> > +	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
> > +
> > +	fput(req->file);
> > +	aio_complete(iocb, mangle_poll(mask), 0);
> > +}
> 
> Careful.
> 
> > +static int aio_poll_cancel(struct kiocb *iocb)
> > +{
> > +	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
> > +	struct poll_iocb *req = &aiocb->poll;
> > +	struct wait_queue_head *head = req->head;
> > +	bool found = false;
> > +
> > +	spin_lock(&head->lock);
> > +	found = __aio_poll_remove(req);
> > +	spin_unlock(&head->lock);
> 
> What's to guarantee that req->head has not been freed by that point?
> Look: wakeup finds ->ctx_lock held, so it leaves the sucker on the list,
> removes it from the queue and schedules the call of __aio_poll_complete().
> Which gets executed just as we hit aio_poll_cancel(), starting with fput().
> 
> You really want to do aio_complete() before fput().  That way you know
> that req->wait is alive and well at least until the iocb gets removed
> from the list.

Oh, bugger...

wakeup
	removed from queue
	schedule __aio_poll_complete()
			cancel
				grab ctx->lock
				remove from list
work
	aio_complete()
	check if it's in the list
	it isn't, move on to free the sucker
			cancel
				call ->ki_cancel()
				BOOM

Looks like we want to call ->ki_cancel() *BEFORE* removing from the list,
as well as doing fput() after aio_complete().  The same ordering, BTW,
goes for aio_read() et al.

Look:
CPU1: io_cancel() grabs ->ctx_lock, finds iocb and removes it from the list.
CPU2: aio_rw_complete() on that iocb.  Since the sucker is not in the list
anymore, we do NOT spin on ->ctx_lock and proceed to free the iocb.
CPU1: pass the freed iocb to ->ki_cancel().  BOOM.

And if we have fput() done first (in aio_rw_complete()) we are vulnerable to

CPU1: io_cancel() grabs ->ctx_lock, finds iocb and removes it from the list.
CPU2: aio_rw_complete() on that iocb.  fput() done, opening us to rmmod.
CPU1: call ->ki_cancel(), which points to freed memory now.  BOOM.
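For the cancel side, that ordering would make io_cancel() look roughly
like the sketch below (hypothetical code; lookup_kiocb() stands in for the
existing search of ->active_reqs, and its real signature may differ):

	spin_lock_irq(&ctx->ctx_lock);
	kiocb = lookup_kiocb(ctx, user_iocb);
	if (kiocb) {
		/*
		 * Cancel while the iocb is still on ->active_reqs: a racing
		 * completion then has to spin on ->ctx_lock to take the iocb
		 * off the list, and cannot free it under us.
		 */
		ret = kiocb->ki_cancel(&kiocb->rw);
		list_del_init(&kiocb->ki_list);
	}
	spin_unlock_irq(&ctx->ctx_lock);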
On Wed, May 23, 2018 at 01:45:30AM +0100, Al Viro wrote:
> Oh, bugger...
> 
> wakeup
> 	removed from queue
> 	schedule __aio_poll_complete()
> 			cancel
> 				grab ctx->lock
> 				remove from list
> work
> 	aio_complete()
> 	check if it's in the list
> 	it isn't, move on to free the sucker
> 			cancel
> 				call ->ki_cancel()
> 				BOOM
> 
> Looks like we want to call ->ki_cancel() *BEFORE* removing from the list,
> as well as doing fput() after aio_complete().  The same ordering, BTW,
> goes for aio_read() et al.
> 
> Look:
> CPU1: io_cancel() grabs ->ctx_lock, finds iocb and removes it from the list.
> CPU2: aio_rw_complete() on that iocb.  Since the sucker is not in the list
> anymore, we do NOT spin on ->ctx_lock and proceed to free the iocb.
> CPU1: pass the freed iocb to ->ki_cancel().  BOOM.

BTW, it seems that the mainline is vulnerable to this one.  I might be
missing something, but...
diff --git a/fs/aio.c b/fs/aio.c
index 8991baa38d5d..4d1eabce6659 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -5,6 +5,7 @@
  *	Implements an efficient asynchronous io interface.
  *
  *	Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved.
+ *	Copyright 2018 Christoph Hellwig.
  *
  *	See ../COPYING for licensing terms.
  */
@@ -164,10 +165,22 @@ struct fsync_iocb {
 	bool			datasync;
 };
 
+struct poll_iocb {
+	struct file		*file;
+	__poll_t		events;
+	struct wait_queue_head	*head;
+
+	union {
+		struct wait_queue_entry	wait;
+		struct work_struct	work;
+	};
+};
+
 struct aio_kiocb {
 	union {
 		struct kiocb		rw;
 		struct fsync_iocb	fsync;
+		struct poll_iocb	poll;
 	};
 
 	struct kioctx		*ki_ctx;
@@ -1558,7 +1571,6 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
 	if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
 			iocb->aio_rw_flags))
 		return -EINVAL;
-
 	req->file = fget(iocb->aio_fildes);
 	if (unlikely(!req->file))
 		return -EBADF;
@@ -1573,6 +1585,124 @@ static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
 	return -EIOCBQUEUED;
 }
 
+/* need to use list_del_init so we can check if item was present */
+static inline bool __aio_poll_remove(struct poll_iocb *req)
+{
+	if (list_empty(&req->wait.entry))
+		return false;
+	list_del_init(&req->wait.entry);
+	return true;
+}
+
+static inline void __aio_poll_complete(struct poll_iocb *req, __poll_t mask)
+{
+	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
+
+	fput(req->file);
+	aio_complete(iocb, mangle_poll(mask), 0);
+}
+
+static void aio_poll_work(struct work_struct *work)
+{
+	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
+
+	__aio_poll_complete(req, req->events);
+}
+
+static int aio_poll_cancel(struct kiocb *iocb)
+{
+	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
+	struct poll_iocb *req = &aiocb->poll;
+	struct wait_queue_head *head = req->head;
+	bool found = false;
+
+	spin_lock(&head->lock);
+	found = __aio_poll_remove(req);
+	spin_unlock(&head->lock);
+
+	if (found) {
+		req->events = 0;
+		INIT_WORK(&req->work, aio_poll_work);
+		schedule_work(&req->work);
+	}
+	return 0;
+}
+
+static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+		void *key)
+{
+	struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
+	struct file *file = req->file;
+	__poll_t mask = key_to_poll(key);
+
+	assert_spin_locked(&req->head->lock);
+
+	/* for instances that support it check for an event match first: */
+	if (mask && !(mask & req->events))
+		return 0;
+
+	mask = file->f_op->poll_mask(file, req->events);
+	if (!mask)
+		return 0;
+
+	__aio_poll_remove(req);
+
+	req->events = mask;
+	INIT_WORK(&req->work, aio_poll_work);
+	schedule_work(&req->work);
+	return 1;
+}
+
+static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+{
+	struct kioctx *ctx = aiocb->ki_ctx;
+	struct poll_iocb *req = &aiocb->poll;
+	__poll_t mask;
+
+	/* reject any unknown events outside the normal event mask. */
+	if ((u16)iocb->aio_buf != iocb->aio_buf)
+		return -EINVAL;
+	/* reject fields that are not defined for poll */
+	if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
+		return -EINVAL;
+
+	req->events = demangle_poll(iocb->aio_buf) | POLLERR | POLLHUP;
+	req->file = fget(iocb->aio_fildes);
+	if (unlikely(!req->file))
+		return -EBADF;
+	if (!file_has_poll_mask(req->file))
+		goto out_fail;
+
+	req->head = req->file->f_op->get_poll_head(req->file, req->events);
+	if (!req->head)
+		goto out_fail;
+	if (IS_ERR(req->head)) {
+		mask = EPOLLERR;
+		goto done;
+	}
+
+	init_waitqueue_func_entry(&req->wait, aio_poll_wake);
+	aiocb->ki_cancel = aio_poll_cancel;
+
+	spin_lock_irq(&ctx->ctx_lock);
+	list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
+
+	spin_lock(&req->head->lock);
+	mask = req->file->f_op->poll_mask(req->file, req->events);
+	if (!mask)
+		__add_wait_queue(req->head, &req->wait);
+	spin_unlock(&req->head->lock);
+
+	spin_unlock_irq(&ctx->ctx_lock);
+done:
+	if (mask)
+		__aio_poll_complete(req, mask);
+	return -EIOCBQUEUED;
+out_fail:
+	fput(req->file);
+	return -EINVAL; /* same as no support for IOCB_CMD_POLL */
+}
+
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 			 struct iocb *iocb, bool compat)
 {
@@ -1641,6 +1771,8 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		break;
 	case IOCB_CMD_FDSYNC:
 		ret = aio_fsync(&req->fsync, iocb, true);
+	case IOCB_CMD_POLL:
+		ret = aio_poll(req, iocb);
 		break;
 	default:
 		pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index 2c0a3415beee..ed0185945bb2 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -39,10 +39,8 @@ enum {
 	IOCB_CMD_PWRITE = 1,
 	IOCB_CMD_FSYNC = 2,
 	IOCB_CMD_FDSYNC = 3,
-	/* These two are experimental.
-	 * IOCB_CMD_PREADX = 4,
-	 * IOCB_CMD_POLL = 5,
-	 */
+	/* 4 was the experimental IOCB_CMD_PREADX */
+	IOCB_CMD_POLL = 5,
 	IOCB_CMD_NOOP = 6,
 	IOCB_CMD_PREADV = 7,
 	IOCB_CMD_PWRITEV = 8,
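For context on how this would be consumed: assuming the patch above is
applied (and the uapi header therefore exports IOCB_CMD_POLL), a one-shot
poll from userspace through the raw aio syscalls would look roughly like
the following sketch (illustrative only, not part of the patch; error
handling trimmed):

#include <linux/aio_abi.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static long io_setup(unsigned nr_events, aio_context_t *ctx)
{
	return syscall(__NR_io_setup, nr_events, ctx);
}

static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
{
	return syscall(__NR_io_submit, ctx, nr, iocbpp);
}

static long io_getevents(aio_context_t ctx, long min_nr, long nr,
			 struct io_event *events, struct timespec *timeout)
{
	return syscall(__NR_io_getevents, ctx, min_nr, nr, events, timeout);
}

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;

	if (io_setup(1, &ctx) < 0)
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_POLL;	/* only with this patch applied */
	cb.aio_fildes = 0;			/* poll stdin */
	cb.aio_buf = POLLIN;			/* requested events, epoll-style */

	if (io_submit(ctx, 1, cbs) != 1)
		return 1;

	/* completes once stdin is readable (or on POLLERR/POLLHUP) */
	if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
		return 1;

	/* per __aio_poll_complete(), ev.res carries the returned poll mask */
	printf("poll returned mask 0x%llx\n", (unsigned long long)ev.res);
	return 0;
}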