Message ID | YzN+ZYLjK6HI1P1C@ZenIV (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [CFT,coredump] don't use __kernel_write() on kmap_local_page() | expand |
Al Viro <viro@zeniv.linux.org.uk> writes: > [I'm going to send a pull request tomorrow if nobody yells; > please review and test - it seems to work fine here, but extra > eyes and extra testing would be very welcome] > > passing kmap_local_page() result to __kernel_write() is unsafe - > random ->write_iter() might (and 9p one does) get unhappy when > passed ITER_KVEC with pointer that came from kmap_local_page(). > > Fix by providing a variant of __kernel_write() that takes an iov_iter > from caller (__kernel_write() becomes a trivial wrapper) and adding > dump_emit_page() that parallels dump_emit(), except that instead of > __kernel_write() it uses __kernel_write_iter() with ITER_BVEC source. Acked-by: "Eric W. Biederman" <ebiederm@xmission.com> From a 10,000 foot view this makes sense. > Fixes: 3159ed57792b "fs/coredump: use kmap_local_page()" > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> > --- > diff --git a/fs/coredump.c b/fs/coredump.c > index 9f4aae202109..09dbc981ad5c 100644 > --- a/fs/coredump.c > +++ b/fs/coredump.c > @@ -832,6 +832,38 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) > } > } > > +static int dump_emit_page(struct coredump_params *cprm, struct page *page) > +{ > + struct bio_vec bvec = { > + .bv_page = page, > + .bv_offset = 0, > + .bv_len = PAGE_SIZE, > + }; > + struct iov_iter iter; > + struct file *file = cprm->file; > + loff_t pos = file->f_pos; > + ssize_t n; > + > + if (cprm->to_skip) { > + if (!__dump_skip(cprm, cprm->to_skip)) > + return 0; > + cprm->to_skip = 0; > + } > + if (cprm->written + PAGE_SIZE > cprm->limit) > + return 0; > + if (dump_interrupted()) > + return 0; > + iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE); > + n = __kernel_write_iter(cprm->file, &iter, &pos); > + if (n != PAGE_SIZE) > + return 0; > + file->f_pos = pos; > + cprm->written += PAGE_SIZE; > + cprm->pos += PAGE_SIZE; > + > + return 1; > +} > + > int dump_emit(struct coredump_params *cprm, const void *addr, int nr) > { > if (cprm->to_skip) { > @@ -863,7 +895,6 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, > > for (addr = start; addr < start + len; addr += PAGE_SIZE) { > struct page *page; > - int stop; > > /* > * To avoid having to allocate page tables for virtual address > @@ -874,12 +905,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, > */ > page = get_dump_page(addr); > if (page) { > - void *kaddr = kmap_local_page(page); > - > - stop = !dump_emit(cprm, kaddr, PAGE_SIZE); > - kunmap_local(kaddr); > - put_page(page); > - if (stop) > + if (!dump_emit_page(cprm, page)) > return 0; > } else { > dump_skip(cprm, PAGE_SIZE); > diff --git a/fs/internal.h b/fs/internal.h > index 87e96b9024ce..3e206d3e317c 100644 > --- a/fs/internal.h > +++ b/fs/internal.h > @@ -16,6 +16,7 @@ struct shrink_control; > struct fs_context; > struct user_namespace; > struct pipe_inode_info; > +struct iov_iter; > > /* > * block/bdev.c > @@ -221,3 +222,5 @@ ssize_t do_getxattr(struct user_namespace *mnt_userns, > int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); > int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, > struct xattr_ctx *ctx); > + > +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); > diff --git a/fs/read_write.c b/fs/read_write.c > index 1a261dcf1778..328ce8cf9a85 100644 > --- a/fs/read_write.c > +++ b/fs/read_write.c > @@ -496,14 +496,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t > } > > /* caller is responsible for file_start_write/file_end_write */ > -ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) > +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) > { > - struct kvec iov = { > - .iov_base = (void *)buf, > - .iov_len = min_t(size_t, count, MAX_RW_COUNT), > - }; > struct kiocb kiocb; > - struct iov_iter iter; > ssize_t ret; > > if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) > @@ -519,8 +514,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t > > init_sync_kiocb(&kiocb, file); > kiocb.ki_pos = pos ? *pos : 0; > - iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); > - ret = file->f_op->write_iter(&kiocb, &iter); > + ret = file->f_op->write_iter(&kiocb, from); > if (ret > 0) { > if (pos) > *pos = kiocb.ki_pos; > @@ -530,6 +524,18 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t > inc_syscw(current); > return ret; > } > + > +/* caller is responsible for file_start_write/file_end_write */ > +ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) > +{ > + struct kvec iov = { > + .iov_base = (void *)buf, > + .iov_len = min_t(size_t, count, MAX_RW_COUNT), > + }; > + struct iov_iter iter; > + iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); > + return __kernel_write_iter(file, &iter, pos); > +} > /* > * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", > * but autofs is one of the few internal kernel users that actually
On Tue, Sep 27, 2022 at 11:51:17PM +0100, Al Viro wrote: > [I'm going to send a pull request tomorrow if nobody yells; > please review and test - it seems to work fine here, but extra > eyes and extra testing would be very welcome] ... and now with page leak fixed: commit 06bbaa6dc53cb72040db952053432541acb9adc7 Author: Al Viro <viro@zeniv.linux.org.uk> Date: Mon Sep 26 11:59:14 2022 -0400 [coredump] don't use __kernel_write() on kmap_local_page() passing kmap_local_page() result to __kernel_write() is unsafe - random ->write_iter() might (and 9p one does) get unhappy when passed ITER_KVEC with pointer that came from kmap_local_page(). Fix by providing a variant of __kernel_write() that takes an iov_iter from caller (__kernel_write() becomes a trivial wrapper) and adding dump_emit_page() that parallels dump_emit(), except that instead of __kernel_write() it uses __kernel_write_iter() with ITER_BVEC source. Fixes: 3159ed57792b "fs/coredump: use kmap_local_page()" Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> diff --git a/fs/coredump.c b/fs/coredump.c index 9f4aae202109..1ab4f5b76a1e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -832,6 +832,38 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) } } +static int dump_emit_page(struct coredump_params *cprm, struct page *page) +{ + struct bio_vec bvec = { + .bv_page = page, + .bv_offset = 0, + .bv_len = PAGE_SIZE, + }; + struct iov_iter iter; + struct file *file = cprm->file; + loff_t pos = file->f_pos; + ssize_t n; + + if (cprm->to_skip) { + if (!__dump_skip(cprm, cprm->to_skip)) + return 0; + cprm->to_skip = 0; + } + if (cprm->written + PAGE_SIZE > cprm->limit) + return 0; + if (dump_interrupted()) + return 0; + iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE); + n = __kernel_write_iter(cprm->file, &iter, &pos); + if (n != PAGE_SIZE) + return 0; + file->f_pos = pos; + cprm->written += PAGE_SIZE; + cprm->pos += PAGE_SIZE; + + return 1; +} + int dump_emit(struct coredump_params *cprm, const void *addr, int nr) { if (cprm->to_skip) { @@ -863,7 +895,6 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, for (addr = start; addr < start + len; addr += PAGE_SIZE) { struct page *page; - int stop; /* * To avoid having to allocate page tables for virtual address @@ -874,10 +905,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, */ page = get_dump_page(addr); if (page) { - void *kaddr = kmap_local_page(page); - - stop = !dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap_local(kaddr); + int stop = !dump_emit_page(cprm, page); put_page(page); if (stop) return 0; diff --git a/fs/internal.h b/fs/internal.h index 87e96b9024ce..3e206d3e317c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -16,6 +16,7 @@ struct shrink_control; struct fs_context; struct user_namespace; struct pipe_inode_info; +struct iov_iter; /* * block/bdev.c @@ -221,3 +222,5 @@ ssize_t do_getxattr(struct user_namespace *mnt_userns, int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct xattr_ctx *ctx); + +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); diff --git a/fs/read_write.c b/fs/read_write.c index 1a261dcf1778..328ce8cf9a85 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -496,14 +496,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t } /* caller is responsible for file_start_write/file_end_write */ -ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { - struct kvec iov = { - .iov_base = (void *)buf, - .iov_len = min_t(size_t, count, MAX_RW_COUNT), - }; struct kiocb kiocb; - struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) @@ -519,8 +514,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; - iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); - ret = file->f_op->write_iter(&kiocb, &iter); + ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; @@ -530,6 +524,18 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t inc_syscw(current); return ret; } + +/* caller is responsible for file_start_write/file_end_write */ +ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) +{ + struct kvec iov = { + .iov_base = (void *)buf, + .iov_len = min_t(size_t, count, MAX_RW_COUNT), + }; + struct iov_iter iter; + iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); + return __kernel_write_iter(file, &iter, pos); +} /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually
On Wed, Sep 28, 2022 at 07:29:43PM +0100, Al Viro wrote: > On Tue, Sep 27, 2022 at 11:51:17PM +0100, Al Viro wrote: > > [I'm going to send a pull request tomorrow if nobody yells; > > please review and test - it seems to work fine here, but extra > > eyes and extra testing would be very welcome] > > ... and now with page leak fixed: > > commit 06bbaa6dc53cb72040db952053432541acb9adc7 > Author: Al Viro <viro@zeniv.linux.org.uk> > Date: Mon Sep 26 11:59:14 2022 -0400 > > [coredump] don't use __kernel_write() on kmap_local_page() > > passing kmap_local_page() result to __kernel_write() is unsafe - > random ->write_iter() might (and 9p one does) get unhappy when > passed ITER_KVEC with pointer that came from kmap_local_page(). > > Fix by providing a variant of __kernel_write() that takes an iov_iter > from caller (__kernel_write() becomes a trivial wrapper) and adding > dump_emit_page() that parallels dump_emit(), except that instead of > __kernel_write() it uses __kernel_write_iter() with ITER_BVEC source. > > Fixes: 3159ed57792b "fs/coredump: use kmap_local_page()" > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Reviewed-by: Ira Weiny <ira.weiny@intel.com> Tested-by: Ira Weiny <ira.weiny@intel.com> > > diff --git a/fs/coredump.c b/fs/coredump.c > index 9f4aae202109..1ab4f5b76a1e 100644 > --- a/fs/coredump.c > +++ b/fs/coredump.c > @@ -832,6 +832,38 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) > } > } > > +static int dump_emit_page(struct coredump_params *cprm, struct page *page) > +{ > + struct bio_vec bvec = { > + .bv_page = page, > + .bv_offset = 0, > + .bv_len = PAGE_SIZE, > + }; > + struct iov_iter iter; > + struct file *file = cprm->file; > + loff_t pos = file->f_pos; > + ssize_t n; > + > + if (cprm->to_skip) { > + if (!__dump_skip(cprm, cprm->to_skip)) > + return 0; > + cprm->to_skip = 0; > + } > + if (cprm->written + PAGE_SIZE > cprm->limit) > + return 0; > + if (dump_interrupted()) > + return 0; > + iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE); > + n = __kernel_write_iter(cprm->file, &iter, &pos); > + if (n != PAGE_SIZE) > + return 0; > + file->f_pos = pos; > + cprm->written += PAGE_SIZE; > + cprm->pos += PAGE_SIZE; > + > + return 1; > +} > + > int dump_emit(struct coredump_params *cprm, const void *addr, int nr) > { > if (cprm->to_skip) { > @@ -863,7 +895,6 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, > > for (addr = start; addr < start + len; addr += PAGE_SIZE) { > struct page *page; > - int stop; > > /* > * To avoid having to allocate page tables for virtual address > @@ -874,10 +905,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, > */ > page = get_dump_page(addr); > if (page) { > - void *kaddr = kmap_local_page(page); > - > - stop = !dump_emit(cprm, kaddr, PAGE_SIZE); > - kunmap_local(kaddr); > + int stop = !dump_emit_page(cprm, page); > put_page(page); > if (stop) > return 0; > diff --git a/fs/internal.h b/fs/internal.h > index 87e96b9024ce..3e206d3e317c 100644 > --- a/fs/internal.h > +++ b/fs/internal.h > @@ -16,6 +16,7 @@ struct shrink_control; > struct fs_context; > struct user_namespace; > struct pipe_inode_info; > +struct iov_iter; > > /* > * block/bdev.c > @@ -221,3 +222,5 @@ ssize_t do_getxattr(struct user_namespace *mnt_userns, > int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); > int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, > struct xattr_ctx *ctx); > + > +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); > diff --git a/fs/read_write.c b/fs/read_write.c > index 1a261dcf1778..328ce8cf9a85 100644 > --- a/fs/read_write.c > +++ b/fs/read_write.c > @@ -496,14 +496,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t > } > > /* caller is responsible for file_start_write/file_end_write */ > -ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) > +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) > { > - struct kvec iov = { > - .iov_base = (void *)buf, > - .iov_len = min_t(size_t, count, MAX_RW_COUNT), > - }; > struct kiocb kiocb; > - struct iov_iter iter; > ssize_t ret; > > if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) > @@ -519,8 +514,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t > > init_sync_kiocb(&kiocb, file); > kiocb.ki_pos = pos ? *pos : 0; > - iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); > - ret = file->f_op->write_iter(&kiocb, &iter); > + ret = file->f_op->write_iter(&kiocb, from); > if (ret > 0) { > if (pos) > *pos = kiocb.ki_pos; > @@ -530,6 +524,18 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t > inc_syscw(current); > return ret; > } > + > +/* caller is responsible for file_start_write/file_end_write */ > +ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) > +{ > + struct kvec iov = { > + .iov_base = (void *)buf, > + .iov_len = min_t(size_t, count, MAX_RW_COUNT), > + }; > + struct iov_iter iter; > + iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); > + return __kernel_write_iter(file, &iter, pos); > +} > /* > * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", > * but autofs is one of the few internal kernel users that actually >
Ira Weiny: > On Wed, Sep 28, 2022 at 07:29:43PM +0100, Al Viro wrote: > > On Tue, Sep 27, 2022 at 11:51:17PM +0100, Al Viro wrote: > > > [I'm going to send a pull request tomorrow if nobody yells; > > > please review and test - it seems to work fine here, but extra > > > eyes and extra testing would be very welcome] I tried gdb backtrace 'bt' command with the new core by v6.0, and it doesn't show the call trace correctly. Is it related to this commit? test program ---------------------------------------- void f(int n) { printf("%d\n", n); if (!n) kill(getpid(), SIGQUIT); f(--n); } int main(int argc, char *argv[]) { f(atoi(argv[1])); return 0; } ---------------------------------------- ulimit -c unlimited coredump 2 gdb coredump core bt ---------------------------------------- expected result kill f f f main ---------------------------------------- actual result ?? ?? ---------------------------------------- J. R. Okajima
On Mon, Oct 03, 2022 at 07:48:28PM +0900, J. R. Okajima wrote: > Ira Weiny: > > On Wed, Sep 28, 2022 at 07:29:43PM +0100, Al Viro wrote: > > > On Tue, Sep 27, 2022 at 11:51:17PM +0100, Al Viro wrote: > > > > [I'm going to send a pull request tomorrow if nobody yells; > > > > please review and test - it seems to work fine here, but extra > > > > eyes and extra testing would be very welcome] > > I tried gdb backtrace 'bt' command with the new core by v6.0, and it > doesn't show the call trace correctly. Is it related to this commit? > Are you also getting something like this? BFD: warning: /mnt/9p/test-okajima/core is truncated: expected core file size >= 225280, found: 76616 I did not see that before. I'm running through this patch vs a fix to kmap_to_page()[1] and I may have gotten the 2 crossed up. So I'm retesting with your test below. Ira [1] https://lore.kernel.org/linux-mm/20221003040427.1082050-1-ira.weiny@intel.com/ > test program > ---------------------------------------- > void f(int n) > { > printf("%d\n", n); > if (!n) > kill(getpid(), SIGQUIT); > f(--n); > } > > int main(int argc, char *argv[]) > { > f(atoi(argv[1])); > return 0; > } > ---------------------------------------- > ulimit -c unlimited > coredump 2 > gdb coredump core > bt > ---------------------------------------- > expected result > kill > f > f > f > main > ---------------------------------------- > actual result > ?? > ?? > ---------------------------------------- > > > J. R. Okajima
On Mon, Oct 03, 2022 at 12:31:36PM -0700, Ira Weiny wrote: > On Mon, Oct 03, 2022 at 07:48:28PM +0900, J. R. Okajima wrote: > > Ira Weiny: > > > On Wed, Sep 28, 2022 at 07:29:43PM +0100, Al Viro wrote: > > > > On Tue, Sep 27, 2022 at 11:51:17PM +0100, Al Viro wrote: > > > > > [I'm going to send a pull request tomorrow if nobody yells; > > > > > please review and test - it seems to work fine here, but extra > > > > > eyes and extra testing would be very welcome] > > > > I tried gdb backtrace 'bt' command with the new core by v6.0, and it > > doesn't show the call trace correctly. Is it related to this commit? > > > > Are you also getting something like this? > > BFD: warning: /mnt/9p/test-okajima/core is truncated: expected core file size >= 225280, found: 76616 > > I did not see that before. I'm running through this patch vs a fix to > kmap_to_page()[1] and I may have gotten the 2 crossed up. So I'm retesting > with your test below. Argh.... Try this: fix coredump breakage caused by badly tested "[coredump] don't use __kernel_write() on kmap_local_page()" Let me count the ways I'd screwed up: * when emitting a page, handling of gaps in coredump should happen before fetching the current file position. * fix for problem that occurs on rather uncommon setups (and hadn't been observed in the wild) sent very late in the cycle. * ... with badly insufficient testing, introducing an easily reproducable breakage. Without giving it time to soak in -next. Fucked-up-by: Al Viro <viro@zeniv.linux.org.uk> Fixes: 06bbaa6dc53c "[coredump] don't use __kernel_write() on kmap_local_page()" Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- diff --git a/fs/coredump.c b/fs/coredump.c index 1ab4f5b76a1e..3538f3a63965 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -841,7 +841,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) }; struct iov_iter iter; struct file *file = cprm->file; - loff_t pos = file->f_pos; + loff_t pos; ssize_t n; if (cprm->to_skip) { @@ -853,6 +853,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) return 0; if (dump_interrupted()) return 0; + pos = file->f_pos; iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE); n = __kernel_write_iter(cprm->file, &iter, &pos); if (n != PAGE_SIZE)
Al Viro: > Argh.... Try this: > > fix coredump breakage caused by badly tested "[coredump] don't use __kernel_write() on kmap_local_page()" Thanx, it passed my local test. > * fix for problem that occurs on rather uncommon setups (and hadn't > been observed in the wild) sent very late in the cycle. If the commit was merged in RC versions, I guess someone found the problem earlier. J. R. Okajima
On Tue, Oct 04, 2022 at 07:58:14AM +0900, J. R. Okajima wrote: > Al Viro: > > Argh.... Try this: > > > > fix coredump breakage caused by badly tested "[coredump] don't use __kernel_write() on kmap_local_page()" > > Thanx, it passed my local test. > > > > * fix for problem that occurs on rather uncommon setups (and hadn't > > been observed in the wild) sent very late in the cycle. > > If the commit was merged in RC versions, I guess someone found the > problem earlier. Most likely - the breakage is really not hard to trigger ;-/ Linus, which way would you prefer to handle that? It's a brown paperbag stuff - the worst I had in quite a while. Mea maxima culpa ;-/ One variant would be to revert the original patch, put its (hopefully) fixed variant into -next and let it sit there for a while. Another is to put this incremental into -next and merge it into mainline once it gets a sane amount of testing. Up to you...
On Mon, Oct 3, 2022 at 4:37 PM Al Viro <viro@zeniv.linux.org.uk> wrote: > > One variant would be to revert the original patch, put its > (hopefully) fixed variant into -next and let it sit there for > a while. Another is to put this incremental into -next and > merge it into mainline once it gets a sane amount of testing. Just do the incremental fix. It looks obvious enough ("oops, we need to get the pos _after_ we've done any skip-lseeks on the core file") that I think it would be just harder to follow a "revert and follow up with a fix". I don't think it needs a ton of extra testing, with Okajima having already confirmed it fixes his problem case.. Linus
On Mon, Oct 03, 2022 at 05:20:03PM -0700, Linus Torvalds wrote: > On Mon, Oct 3, 2022 at 4:37 PM Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > One variant would be to revert the original patch, put its > > (hopefully) fixed variant into -next and let it sit there for > > a while. Another is to put this incremental into -next and > > merge it into mainline once it gets a sane amount of testing. > > Just do the incremental fix. It looks obvious enough ("oops, we need > to get the pos _after_ we've done any skip-lseeks on the core file") > that I think it would be just harder to follow a "revert and follow up > with a fix". > > I don't think it needs a ton of extra testing, with Okajima having > already confirmed it fixes his problem case.. OK, incremental is in #fixes, pushed out.
On Mon, Oct 3, 2022 at 5:31 PM Al Viro <viro@zeniv.linux.org.uk> wrote: > > OK, incremental is in #fixes, pushed out. I'm assuming I'll still get a proper pull request. No? Linus
On Mon, Oct 03, 2022 at 05:37:24PM -0700, Linus Torvalds wrote: > On Mon, Oct 3, 2022 at 5:31 PM Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > OK, incremental is in #fixes, pushed out. > > I'm assuming I'll still get a proper pull request. No? Pull request follows; after all, whatever else is there, this is an obvious fix for the breakage Okajima caught... Al, very much hoping there's no other embarrassing fuckups lurking in that thing ;-/ The following changes since commit 4fe89d07dcc2804c8b562f6c7896a45643d34b2f: Linux 6.0 (2022-10-02 14:09:07 -0700) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git tags/pull-fixes for you to fetch changes up to 4f526fef91b24197d489ff86789744c67f475bb4: [brown paperbag] fix coredump breakage (2022-10-03 20:28:38 -0400) ---------------------------------------------------------------- fs/coredump fix ---------------------------------------------------------------- Al Viro (1): [brown paperbag] fix coredump breakage fs/coredump.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
On Tue, Oct 04, 2022 at 07:58:14AM +0900, J. R. Okajima wrote: > Al Viro: > > Argh.... Try this: > > > > fix coredump breakage caused by badly tested "[coredump] don't use __kernel_write() on kmap_local_page()" > > Thanx, it passed my local test. My mess up too. I thought I had tested this when I had tested the kmap_to_page() fix. :-( This works for me too now that I have my git trees straight. Sorry. Ira > > > > * fix for problem that occurs on rather uncommon setups (and hadn't > > been observed in the wild) sent very late in the cycle. > > If the commit was merged in RC versions, I guess someone found the > problem earlier. > > > J. R. Okajima
On Tue, Oct 4, 2022 at 2:51 AM Linus Torvalds <torvalds@linux-foundation.org> wrote: > > On Mon, Oct 3, 2022 at 4:37 PM Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > One variant would be to revert the original patch, put its > > (hopefully) fixed variant into -next and let it sit there for > > a while. Another is to put this incremental into -next and > > merge it into mainline once it gets a sane amount of testing. > > Just do the incremental fix. It looks obvious enough ("oops, we need > to get the pos _after_ we've done any skip-lseeks on the core file") > that I think it would be just harder to follow a "revert and follow up > with a fix". > > I don't think it needs a ton of extra testing, with Okajima having > already confirmed it fixes his problem case.. > > Linus [ CC Geert ] There was another patch from Geert concerning the same coredump changes: [PATCH] coredump: Move dump_emit_page() to kill unused warning If CONFIG_ELF_CORE is not set: fs/coredump.c:835:12: error: ‘dump_emit_page’ defined but not used [-Werror=unused-function] 835 | static int dump_emit_page(struct coredump_params *cprm, struct page *page) | ^~~~~~~~~~~~~~ Fix this by moving dump_emit_page() inside the existing section protected by #ifdef CONFIG_ELF_CORE. Fixes: 06bbaa6dc53cb720 ("[coredump] don't use __kernel_write() on kmap_local_page()") Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org> Please, check yourself! Best regards, -Sedat- [1] https://lore.kernel.org/all/20221003090657.2053236-1-geert@linux-m68k.org/
On Tue, Oct 4, 2022 at 8:18 AM Sedat Dilek <sedat.dilek@gmail.com> wrote: > > On Tue, Oct 4, 2022 at 2:51 AM Linus Torvalds > <torvalds@linux-foundation.org> wrote: > > > > On Mon, Oct 3, 2022 at 4:37 PM Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > > > One variant would be to revert the original patch, put its > > > (hopefully) fixed variant into -next and let it sit there for > > > a while. Another is to put this incremental into -next and > > > merge it into mainline once it gets a sane amount of testing. > > > > Just do the incremental fix. It looks obvious enough ("oops, we need > > to get the pos _after_ we've done any skip-lseeks on the core file") > > that I think it would be just harder to follow a "revert and follow up > > with a fix". > > > > I don't think it needs a ton of extra testing, with Okajima having > > already confirmed it fixes his problem case.. > > > > Linus > > [ CC Geert ] > > There was another patch from Geert concerning the same coredump changes: > > [PATCH] coredump: Move dump_emit_page() to kill unused warning > > If CONFIG_ELF_CORE is not set: > > fs/coredump.c:835:12: error: ‘dump_emit_page’ defined but not used > [-Werror=unused-function] > 835 | static int dump_emit_page(struct coredump_params *cprm, > struct page *page) > | ^~~~~~~~~~~~~~ > > Fix this by moving dump_emit_page() inside the existing section > protected by #ifdef CONFIG_ELF_CORE. > > Fixes: 06bbaa6dc53cb720 ("[coredump] don't use __kernel_write() on > kmap_local_page()") > Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org> > > Please, check yourself! > > Best regards, > -Sedat- > > [1] https://lore.kernel.org/all/20221003090657.2053236-1-geert@linux-m68k.org/ [ CC Ren Zhijie <renzhijie2@huawei.com> ] Looks like the same patch as of Geert. -Sedat- [1] https://lore.kernel.org/all/20221009092420.32850-1-renzhijie2@huawei.com/
Hi Al, On Tue, Oct 4, 2022 at 8:19 AM Sedat Dilek <sedat.dilek@gmail.com> wrote: > On Tue, Oct 4, 2022 at 2:51 AM Linus Torvalds > <torvalds@linux-foundation.org> wrote: > > On Mon, Oct 3, 2022 at 4:37 PM Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > > > One variant would be to revert the original patch, put its > > > (hopefully) fixed variant into -next and let it sit there for > > > a while. Another is to put this incremental into -next and > > > merge it into mainline once it gets a sane amount of testing. > > > > Just do the incremental fix. It looks obvious enough ("oops, we need > > to get the pos _after_ we've done any skip-lseeks on the core file") > > that I think it would be just harder to follow a "revert and follow up > > with a fix". > > > > I don't think it needs a ton of extra testing, with Okajima having > > already confirmed it fixes his problem case.. > > > > Linus > > [ CC Geert ] > > There was another patch from Geert concerning the same coredump changes: > > [PATCH] coredump: Move dump_emit_page() to kill unused warning > > If CONFIG_ELF_CORE is not set: > > fs/coredump.c:835:12: error: ‘dump_emit_page’ defined but not used > [-Werror=unused-function] > 835 | static int dump_emit_page(struct coredump_params *cprm, > struct page *page) > | ^~~~~~~~~~~~~~ > > Fix this by moving dump_emit_page() inside the existing section > protected by #ifdef CONFIG_ELF_CORE. > > Fixes: 06bbaa6dc53cb720 ("[coredump] don't use __kernel_write() on > kmap_local_page()") > Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org> > > Please, check yourself! The build issue is still present in today's linux-next. Al, can you please apply my fix, so Greg can backport all of this to stable? https://lore.kernel.org/all/YzxxtFSCEsycgXSK@kroah.com Thanks! > [1] https://lore.kernel.org/all/20221003090657.2053236-1-geert@linux-m68k.org/ Gr{oetje,eeting}s, Geert -- Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org In personal conversations with technical people, I call myself a hacker. But when I'm talking to journalists I just say "programmer" or something like that. -- Linus Torvalds
diff --git a/fs/coredump.c b/fs/coredump.c index 9f4aae202109..09dbc981ad5c 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -832,6 +832,38 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) } } +static int dump_emit_page(struct coredump_params *cprm, struct page *page) +{ + struct bio_vec bvec = { + .bv_page = page, + .bv_offset = 0, + .bv_len = PAGE_SIZE, + }; + struct iov_iter iter; + struct file *file = cprm->file; + loff_t pos = file->f_pos; + ssize_t n; + + if (cprm->to_skip) { + if (!__dump_skip(cprm, cprm->to_skip)) + return 0; + cprm->to_skip = 0; + } + if (cprm->written + PAGE_SIZE > cprm->limit) + return 0; + if (dump_interrupted()) + return 0; + iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE); + n = __kernel_write_iter(cprm->file, &iter, &pos); + if (n != PAGE_SIZE) + return 0; + file->f_pos = pos; + cprm->written += PAGE_SIZE; + cprm->pos += PAGE_SIZE; + + return 1; +} + int dump_emit(struct coredump_params *cprm, const void *addr, int nr) { if (cprm->to_skip) { @@ -863,7 +895,6 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, for (addr = start; addr < start + len; addr += PAGE_SIZE) { struct page *page; - int stop; /* * To avoid having to allocate page tables for virtual address @@ -874,12 +905,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, */ page = get_dump_page(addr); if (page) { - void *kaddr = kmap_local_page(page); - - stop = !dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap_local(kaddr); - put_page(page); - if (stop) + if (!dump_emit_page(cprm, page)) return 0; } else { dump_skip(cprm, PAGE_SIZE); diff --git a/fs/internal.h b/fs/internal.h index 87e96b9024ce..3e206d3e317c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -16,6 +16,7 @@ struct shrink_control; struct fs_context; struct user_namespace; struct pipe_inode_info; +struct iov_iter; /* * block/bdev.c @@ -221,3 +222,5 @@ ssize_t do_getxattr(struct user_namespace *mnt_userns, int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct xattr_ctx *ctx); + +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); diff --git a/fs/read_write.c b/fs/read_write.c index 1a261dcf1778..328ce8cf9a85 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -496,14 +496,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t } /* caller is responsible for file_start_write/file_end_write */ -ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { - struct kvec iov = { - .iov_base = (void *)buf, - .iov_len = min_t(size_t, count, MAX_RW_COUNT), - }; struct kiocb kiocb; - struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) @@ -519,8 +514,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; - iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); - ret = file->f_op->write_iter(&kiocb, &iter); + ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; @@ -530,6 +524,18 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t inc_syscw(current); return ret; } + +/* caller is responsible for file_start_write/file_end_write */ +ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) +{ + struct kvec iov = { + .iov_base = (void *)buf, + .iov_len = min_t(size_t, count, MAX_RW_COUNT), + }; + struct iov_iter iter; + iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); + return __kernel_write_iter(file, &iter, pos); +} /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually
[I'm going to send a pull request tomorrow if nobody yells; please review and test - it seems to work fine here, but extra eyes and extra testing would be very welcome] passing kmap_local_page() result to __kernel_write() is unsafe - random ->write_iter() might (and 9p one does) get unhappy when passed ITER_KVEC with pointer that came from kmap_local_page(). Fix by providing a variant of __kernel_write() that takes an iov_iter from caller (__kernel_write() becomes a trivial wrapper) and adding dump_emit_page() that parallels dump_emit(), except that instead of __kernel_write() it uses __kernel_write_iter() with ITER_BVEC source. Fixes: 3159ed57792b "fs/coredump: use kmap_local_page()" Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> ---