diff mbox series

[CFT,coredump] don't use __kernel_write() on kmap_local_page()

Message ID YzN+ZYLjK6HI1P1C@ZenIV (mailing list archive)
State New, archived
Headers show
Series [CFT,coredump] don't use __kernel_write() on kmap_local_page() | expand

Commit Message

Al Viro Sept. 27, 2022, 10:51 p.m. UTC
[I'm going to send a pull request tomorrow if nobody yells;
please review and test - it seems to work fine here, but extra
eyes and extra testing would be very welcome]

passing kmap_local_page() result to __kernel_write() is unsafe -
random ->write_iter() might (and 9p one does) get unhappy when
passed ITER_KVEC with pointer that came from kmap_local_page().
    
Fix by providing a variant of __kernel_write() that takes an iov_iter
from caller (__kernel_write() becomes a trivial wrapper) and adding
dump_emit_page() that parallels dump_emit(), except that instead of
__kernel_write() it uses __kernel_write_iter() with ITER_BVEC source.
  
Fixes: 3159ed57792b "fs/coredump: use kmap_local_page()"
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---

Comments

Eric W. Biederman Sept. 28, 2022, 5:57 p.m. UTC | #1
Al Viro <viro@zeniv.linux.org.uk> writes:

> [I'm going to send a pull request tomorrow if nobody yells;
> please review and test - it seems to work fine here, but extra
> eyes and extra testing would be very welcome]
>
> passing kmap_local_page() result to __kernel_write() is unsafe -
> random ->write_iter() might (and 9p one does) get unhappy when
> passed ITER_KVEC with pointer that came from kmap_local_page().
>     
> Fix by providing a variant of __kernel_write() that takes an iov_iter
> from caller (__kernel_write() becomes a trivial wrapper) and adding
> dump_emit_page() that parallels dump_emit(), except that instead of
> __kernel_write() it uses __kernel_write_iter() with ITER_BVEC source.

Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>

From a 10,000 foot view this makes sense.


> Fixes: 3159ed57792b "fs/coredump: use kmap_local_page()"
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---
> diff --git a/fs/coredump.c b/fs/coredump.c
> index 9f4aae202109..09dbc981ad5c 100644
> --- a/fs/coredump.c
> +++ b/fs/coredump.c
> @@ -832,6 +832,38 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr)
>  	}
>  }
>  
> +static int dump_emit_page(struct coredump_params *cprm, struct page *page)
> +{
> +	struct bio_vec bvec = {
> +		.bv_page	= page,
> +		.bv_offset	= 0,
> +		.bv_len		= PAGE_SIZE,
> +	};
> +	struct iov_iter iter;
> +	struct file *file = cprm->file;
> +	loff_t pos = file->f_pos;
> +	ssize_t n;
> +
> +	if (cprm->to_skip) {
> +		if (!__dump_skip(cprm, cprm->to_skip))
> +			return 0;
> +		cprm->to_skip = 0;
> +	}
> +	if (cprm->written + PAGE_SIZE > cprm->limit)
> +		return 0;
> +	if (dump_interrupted())
> +		return 0;
> +	iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE);
> +	n = __kernel_write_iter(cprm->file, &iter, &pos);
> +	if (n != PAGE_SIZE)
> +		return 0;
> +	file->f_pos = pos;
> +	cprm->written += PAGE_SIZE;
> +	cprm->pos += PAGE_SIZE;
> +
> +	return 1;
> +}
> +
>  int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
>  {
>  	if (cprm->to_skip) {
> @@ -863,7 +895,6 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
>  
>  	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
>  		struct page *page;
> -		int stop;
>  
>  		/*
>  		 * To avoid having to allocate page tables for virtual address
> @@ -874,12 +905,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
>  		 */
>  		page = get_dump_page(addr);
>  		if (page) {
> -			void *kaddr = kmap_local_page(page);
> -
> -			stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
> -			kunmap_local(kaddr);
> -			put_page(page);
> -			if (stop)
> +			if (!dump_emit_page(cprm, page))
>  				return 0;
>  		} else {
>  			dump_skip(cprm, PAGE_SIZE);
> diff --git a/fs/internal.h b/fs/internal.h
> index 87e96b9024ce..3e206d3e317c 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -16,6 +16,7 @@ struct shrink_control;
>  struct fs_context;
>  struct user_namespace;
>  struct pipe_inode_info;
> +struct iov_iter;
>  
>  /*
>   * block/bdev.c
> @@ -221,3 +222,5 @@ ssize_t do_getxattr(struct user_namespace *mnt_userns,
>  int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
>  int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
>  		struct xattr_ctx *ctx);
> +
> +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 1a261dcf1778..328ce8cf9a85 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -496,14 +496,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
>  }
>  
>  /* caller is responsible for file_start_write/file_end_write */
> -ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
> +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
>  {
> -	struct kvec iov = {
> -		.iov_base	= (void *)buf,
> -		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
> -	};
>  	struct kiocb kiocb;
> -	struct iov_iter iter;
>  	ssize_t ret;
>  
>  	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
> @@ -519,8 +514,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
>  
>  	init_sync_kiocb(&kiocb, file);
>  	kiocb.ki_pos = pos ? *pos : 0;
> -	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
> -	ret = file->f_op->write_iter(&kiocb, &iter);
> +	ret = file->f_op->write_iter(&kiocb, from);
>  	if (ret > 0) {
>  		if (pos)
>  			*pos = kiocb.ki_pos;
> @@ -530,6 +524,18 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
>  	inc_syscw(current);
>  	return ret;
>  }
> +
> +/* caller is responsible for file_start_write/file_end_write */
> +ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
> +{
> +	struct kvec iov = {
> +		.iov_base	= (void *)buf,
> +		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
> +	};
> +	struct iov_iter iter;
> +	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
> +	return __kernel_write_iter(file, &iter, pos);
> +}
>  /*
>   * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
>   * but autofs is one of the few internal kernel users that actually
Al Viro Sept. 28, 2022, 6:29 p.m. UTC | #2
On Tue, Sep 27, 2022 at 11:51:17PM +0100, Al Viro wrote:
> [I'm going to send a pull request tomorrow if nobody yells;
> please review and test - it seems to work fine here, but extra
> eyes and extra testing would be very welcome]

... and now with page leak fixed:

commit 06bbaa6dc53cb72040db952053432541acb9adc7
Author: Al Viro <viro@zeniv.linux.org.uk>
Date:   Mon Sep 26 11:59:14 2022 -0400

    [coredump] don't use __kernel_write() on kmap_local_page()
    
    passing kmap_local_page() result to __kernel_write() is unsafe -
    random ->write_iter() might (and 9p one does) get unhappy when
    passed ITER_KVEC with pointer that came from kmap_local_page().
    
    Fix by providing a variant of __kernel_write() that takes an iov_iter
    from caller (__kernel_write() becomes a trivial wrapper) and adding
    dump_emit_page() that parallels dump_emit(), except that instead of
    __kernel_write() it uses __kernel_write_iter() with ITER_BVEC source.
    
    Fixes: 3159ed57792b "fs/coredump: use kmap_local_page()"
    Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/coredump.c b/fs/coredump.c
index 9f4aae202109..1ab4f5b76a1e 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -832,6 +832,38 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr)
 	}
 }
 
+static int dump_emit_page(struct coredump_params *cprm, struct page *page)
+{
+	struct bio_vec bvec = {
+		.bv_page	= page,
+		.bv_offset	= 0,
+		.bv_len		= PAGE_SIZE,
+	};
+	struct iov_iter iter;
+	struct file *file = cprm->file;
+	loff_t pos = file->f_pos;
+	ssize_t n;
+
+	if (cprm->to_skip) {
+		if (!__dump_skip(cprm, cprm->to_skip))
+			return 0;
+		cprm->to_skip = 0;
+	}
+	if (cprm->written + PAGE_SIZE > cprm->limit)
+		return 0;
+	if (dump_interrupted())
+		return 0;
+	iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE);
+	n = __kernel_write_iter(cprm->file, &iter, &pos);
+	if (n != PAGE_SIZE)
+		return 0;
+	file->f_pos = pos;
+	cprm->written += PAGE_SIZE;
+	cprm->pos += PAGE_SIZE;
+
+	return 1;
+}
+
 int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
 {
 	if (cprm->to_skip) {
@@ -863,7 +895,6 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
 
 	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
 		struct page *page;
-		int stop;
 
 		/*
 		 * To avoid having to allocate page tables for virtual address
@@ -874,10 +905,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		 */
 		page = get_dump_page(addr);
 		if (page) {
-			void *kaddr = kmap_local_page(page);
-
-			stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
-			kunmap_local(kaddr);
+			int stop = !dump_emit_page(cprm, page);
 			put_page(page);
 			if (stop)
 				return 0;
diff --git a/fs/internal.h b/fs/internal.h
index 87e96b9024ce..3e206d3e317c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -16,6 +16,7 @@ struct shrink_control;
 struct fs_context;
 struct user_namespace;
 struct pipe_inode_info;
+struct iov_iter;
 
 /*
  * block/bdev.c
@@ -221,3 +222,5 @@ ssize_t do_getxattr(struct user_namespace *mnt_userns,
 int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
 int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		struct xattr_ctx *ctx);
+
+ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);
diff --git a/fs/read_write.c b/fs/read_write.c
index 1a261dcf1778..328ce8cf9a85 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -496,14 +496,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
 }
 
 /* caller is responsible for file_start_write/file_end_write */
-ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
+ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
 {
-	struct kvec iov = {
-		.iov_base	= (void *)buf,
-		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
-	};
 	struct kiocb kiocb;
-	struct iov_iter iter;
 	ssize_t ret;
 
 	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
@@ -519,8 +514,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
 
 	init_sync_kiocb(&kiocb, file);
 	kiocb.ki_pos = pos ? *pos : 0;
-	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
-	ret = file->f_op->write_iter(&kiocb, &iter);
+	ret = file->f_op->write_iter(&kiocb, from);
 	if (ret > 0) {
 		if (pos)
 			*pos = kiocb.ki_pos;
@@ -530,6 +524,18 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
 	inc_syscw(current);
 	return ret;
 }
+
+/* caller is responsible for file_start_write/file_end_write */
+ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
+{
+	struct kvec iov = {
+		.iov_base	= (void *)buf,
+		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
+	};
+	struct iov_iter iter;
+	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
+	return __kernel_write_iter(file, &iter, pos);
+}
 /*
  * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
  * but autofs is one of the few internal kernel users that actually
Ira Weiny Oct. 3, 2022, 3:51 a.m. UTC | #3
On Wed, Sep 28, 2022 at 07:29:43PM +0100, Al Viro wrote:
> On Tue, Sep 27, 2022 at 11:51:17PM +0100, Al Viro wrote:
> > [I'm going to send a pull request tomorrow if nobody yells;
> > please review and test - it seems to work fine here, but extra
> > eyes and extra testing would be very welcome]
> 
> ... and now with page leak fixed:
> 
> commit 06bbaa6dc53cb72040db952053432541acb9adc7
> Author: Al Viro <viro@zeniv.linux.org.uk>
> Date:   Mon Sep 26 11:59:14 2022 -0400
> 
>     [coredump] don't use __kernel_write() on kmap_local_page()
>     
>     passing kmap_local_page() result to __kernel_write() is unsafe -
>     random ->write_iter() might (and 9p one does) get unhappy when
>     passed ITER_KVEC with pointer that came from kmap_local_page().
>     
>     Fix by providing a variant of __kernel_write() that takes an iov_iter
>     from caller (__kernel_write() becomes a trivial wrapper) and adding
>     dump_emit_page() that parallels dump_emit(), except that instead of
>     __kernel_write() it uses __kernel_write_iter() with ITER_BVEC source.
>     
>     Fixes: 3159ed57792b "fs/coredump: use kmap_local_page()"
>     Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Tested-by: Ira Weiny <ira.weiny@intel.com>

> 
> diff --git a/fs/coredump.c b/fs/coredump.c
> index 9f4aae202109..1ab4f5b76a1e 100644
> --- a/fs/coredump.c
> +++ b/fs/coredump.c
> @@ -832,6 +832,38 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr)
>  	}
>  }
>  
> +static int dump_emit_page(struct coredump_params *cprm, struct page *page)
> +{
> +	struct bio_vec bvec = {
> +		.bv_page	= page,
> +		.bv_offset	= 0,
> +		.bv_len		= PAGE_SIZE,
> +	};
> +	struct iov_iter iter;
> +	struct file *file = cprm->file;
> +	loff_t pos = file->f_pos;
> +	ssize_t n;
> +
> +	if (cprm->to_skip) {
> +		if (!__dump_skip(cprm, cprm->to_skip))
> +			return 0;
> +		cprm->to_skip = 0;
> +	}
> +	if (cprm->written + PAGE_SIZE > cprm->limit)
> +		return 0;
> +	if (dump_interrupted())
> +		return 0;
> +	iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE);
> +	n = __kernel_write_iter(cprm->file, &iter, &pos);
> +	if (n != PAGE_SIZE)
> +		return 0;
> +	file->f_pos = pos;
> +	cprm->written += PAGE_SIZE;
> +	cprm->pos += PAGE_SIZE;
> +
> +	return 1;
> +}
> +
>  int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
>  {
>  	if (cprm->to_skip) {
> @@ -863,7 +895,6 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
>  
>  	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
>  		struct page *page;
> -		int stop;
>  
>  		/*
>  		 * To avoid having to allocate page tables for virtual address
> @@ -874,10 +905,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
>  		 */
>  		page = get_dump_page(addr);
>  		if (page) {
> -			void *kaddr = kmap_local_page(page);
> -
> -			stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
> -			kunmap_local(kaddr);
> +			int stop = !dump_emit_page(cprm, page);
>  			put_page(page);
>  			if (stop)
>  				return 0;
> diff --git a/fs/internal.h b/fs/internal.h
> index 87e96b9024ce..3e206d3e317c 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -16,6 +16,7 @@ struct shrink_control;
>  struct fs_context;
>  struct user_namespace;
>  struct pipe_inode_info;
> +struct iov_iter;
>  
>  /*
>   * block/bdev.c
> @@ -221,3 +222,5 @@ ssize_t do_getxattr(struct user_namespace *mnt_userns,
>  int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
>  int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
>  		struct xattr_ctx *ctx);
> +
> +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 1a261dcf1778..328ce8cf9a85 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -496,14 +496,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
>  }
>  
>  /* caller is responsible for file_start_write/file_end_write */
> -ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
> +ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
>  {
> -	struct kvec iov = {
> -		.iov_base	= (void *)buf,
> -		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
> -	};
>  	struct kiocb kiocb;
> -	struct iov_iter iter;
>  	ssize_t ret;
>  
>  	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
> @@ -519,8 +514,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
>  
>  	init_sync_kiocb(&kiocb, file);
>  	kiocb.ki_pos = pos ? *pos : 0;
> -	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
> -	ret = file->f_op->write_iter(&kiocb, &iter);
> +	ret = file->f_op->write_iter(&kiocb, from);
>  	if (ret > 0) {
>  		if (pos)
>  			*pos = kiocb.ki_pos;
> @@ -530,6 +524,18 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
>  	inc_syscw(current);
>  	return ret;
>  }
> +
> +/* caller is responsible for file_start_write/file_end_write */
> +ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
> +{
> +	struct kvec iov = {
> +		.iov_base	= (void *)buf,
> +		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
> +	};
> +	struct iov_iter iter;
> +	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
> +	return __kernel_write_iter(file, &iter, pos);
> +}
>  /*
>   * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
>   * but autofs is one of the few internal kernel users that actually
>
J. R. Okajima Oct. 3, 2022, 10:48 a.m. UTC | #4
Ira Weiny:
> On Wed, Sep 28, 2022 at 07:29:43PM +0100, Al Viro wrote:
> > On Tue, Sep 27, 2022 at 11:51:17PM +0100, Al Viro wrote:
> > > [I'm going to send a pull request tomorrow if nobody yells;
> > > please review and test - it seems to work fine here, but extra
> > > eyes and extra testing would be very welcome]

I tried gdb backtrace 'bt' command with the new core by v6.0, and it
doesn't show the call trace correctly. Is it related to this commit?

test program
----------------------------------------
void f(int n)
{
	printf("%d\n", n);
	if (!n)
		kill(getpid(), SIGQUIT);
	f(--n);
}

int main(int argc, char *argv[])
{
	f(atoi(argv[1]));
	return 0;
}
----------------------------------------
ulimit -c unlimited
coredump 2
gdb coredump core
bt
----------------------------------------
expected result
kill
f
f
f
main
----------------------------------------
actual result
??
??
----------------------------------------


J. R. Okajima
Ira Weiny Oct. 3, 2022, 7:31 p.m. UTC | #5
On Mon, Oct 03, 2022 at 07:48:28PM +0900, J. R. Okajima wrote:
> Ira Weiny:
> > On Wed, Sep 28, 2022 at 07:29:43PM +0100, Al Viro wrote:
> > > On Tue, Sep 27, 2022 at 11:51:17PM +0100, Al Viro wrote:
> > > > [I'm going to send a pull request tomorrow if nobody yells;
> > > > please review and test - it seems to work fine here, but extra
> > > > eyes and extra testing would be very welcome]
> 
> I tried gdb backtrace 'bt' command with the new core by v6.0, and it
> doesn't show the call trace correctly. Is it related to this commit?
>

Are you also getting something like this?

BFD: warning: /mnt/9p/test-okajima/core is truncated: expected core file size >= 225280, found: 76616

I did not see that before.  I'm running through this patch vs a fix to
kmap_to_page()[1] and I may have gotten the 2 crossed up.  So I'm retesting
with your test below.

Ira

[1] https://lore.kernel.org/linux-mm/20221003040427.1082050-1-ira.weiny@intel.com/
 
> test program
> ----------------------------------------
> void f(int n)
> {
> 	printf("%d\n", n);
> 	if (!n)
> 		kill(getpid(), SIGQUIT);
> 	f(--n);
> }
> 
> int main(int argc, char *argv[])
> {
> 	f(atoi(argv[1]));
> 	return 0;
> }
> ----------------------------------------
> ulimit -c unlimited
> coredump 2
> gdb coredump core
> bt
> ----------------------------------------
> expected result
> kill
> f
> f
> f
> main
> ----------------------------------------
> actual result
> ??
> ??
> ----------------------------------------
> 
> 
> J. R. Okajima
Al Viro Oct. 3, 2022, 10:18 p.m. UTC | #6
On Mon, Oct 03, 2022 at 12:31:36PM -0700, Ira Weiny wrote:
> On Mon, Oct 03, 2022 at 07:48:28PM +0900, J. R. Okajima wrote:
> > Ira Weiny:
> > > On Wed, Sep 28, 2022 at 07:29:43PM +0100, Al Viro wrote:
> > > > On Tue, Sep 27, 2022 at 11:51:17PM +0100, Al Viro wrote:
> > > > > [I'm going to send a pull request tomorrow if nobody yells;
> > > > > please review and test - it seems to work fine here, but extra
> > > > > eyes and extra testing would be very welcome]
> > 
> > I tried gdb backtrace 'bt' command with the new core by v6.0, and it
> > doesn't show the call trace correctly. Is it related to this commit?
> >
> 
> Are you also getting something like this?
> 
> BFD: warning: /mnt/9p/test-okajima/core is truncated: expected core file size >= 225280, found: 76616
> 
> I did not see that before.  I'm running through this patch vs a fix to
> kmap_to_page()[1] and I may have gotten the 2 crossed up.  So I'm retesting
> with your test below.

Argh....  Try this:

fix coredump breakage caused by badly tested "[coredump] don't use __kernel_write() on kmap_local_page()"

Let me count the ways I'd screwed up:

* when emitting a page, handling of gaps in coredump should happen
before fetching the current file position.
* fix for problem that occurs on rather uncommon setups (and hadn't
been observed in the wild) sent very late in the cycle.
* ... with badly insufficient testing, introducing an easily
reproducable breakage.  Without giving it time to soak in -next.

Fucked-up-by: Al Viro <viro@zeniv.linux.org.uk>
Fixes: 06bbaa6dc53c "[coredump] don't use __kernel_write() on kmap_local_page()"
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/fs/coredump.c b/fs/coredump.c
index 1ab4f5b76a1e..3538f3a63965 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -841,7 +841,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
 	};
 	struct iov_iter iter;
 	struct file *file = cprm->file;
-	loff_t pos = file->f_pos;
+	loff_t pos;
 	ssize_t n;
 
 	if (cprm->to_skip) {
@@ -853,6 +853,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
 		return 0;
 	if (dump_interrupted())
 		return 0;
+	pos = file->f_pos;
 	iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE);
 	n = __kernel_write_iter(cprm->file, &iter, &pos);
 	if (n != PAGE_SIZE)
J. R. Okajima Oct. 3, 2022, 10:58 p.m. UTC | #7
Al Viro:
> Argh....  Try this:
>
> fix coredump breakage caused by badly tested "[coredump] don't use __kernel_write() on kmap_local_page()"

Thanx, it passed my local test.


> * fix for problem that occurs on rather uncommon setups (and hadn't
> been observed in the wild) sent very late in the cycle.

If the commit was merged in RC versions, I guess someone found the
problem earlier.


J. R. Okajima
Al Viro Oct. 3, 2022, 11:37 p.m. UTC | #8
On Tue, Oct 04, 2022 at 07:58:14AM +0900, J. R. Okajima wrote:
> Al Viro:
> > Argh....  Try this:
> >
> > fix coredump breakage caused by badly tested "[coredump] don't use __kernel_write() on kmap_local_page()"
> 
> Thanx, it passed my local test.
> 
> 
> > * fix for problem that occurs on rather uncommon setups (and hadn't
> > been observed in the wild) sent very late in the cycle.
> 
> If the commit was merged in RC versions, I guess someone found the
> problem earlier.

Most likely - the breakage is really not hard to trigger ;-/

	Linus, which way would you prefer to handle that?  It's
a brown paperbag stuff - the worst I had in quite a while.
Mea maxima culpa ;-/

One variant would be to revert the original patch, put its
(hopefully) fixed variant into -next and let it sit there for
a while.  Another is to put this incremental into -next and
merge it into mainline once it gets a sane amount of testing.

	Up to you...
Linus Torvalds Oct. 4, 2022, 12:20 a.m. UTC | #9
On Mon, Oct 3, 2022 at 4:37 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> One variant would be to revert the original patch, put its
> (hopefully) fixed variant into -next and let it sit there for
> a while.  Another is to put this incremental into -next and
> merge it into mainline once it gets a sane amount of testing.

Just do the incremental fix. It looks obvious enough ("oops, we need
to get the pos _after_ we've done any skip-lseeks on the core file")
that I think it would be just harder to follow a "revert and follow up
with a fix".

I don't think it needs a ton of extra testing, with Okajima having
already confirmed it fixes his problem case..

                Linus
Al Viro Oct. 4, 2022, 12:31 a.m. UTC | #10
On Mon, Oct 03, 2022 at 05:20:03PM -0700, Linus Torvalds wrote:
> On Mon, Oct 3, 2022 at 4:37 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > One variant would be to revert the original patch, put its
> > (hopefully) fixed variant into -next and let it sit there for
> > a while.  Another is to put this incremental into -next and
> > merge it into mainline once it gets a sane amount of testing.
> 
> Just do the incremental fix. It looks obvious enough ("oops, we need
> to get the pos _after_ we've done any skip-lseeks on the core file")
> that I think it would be just harder to follow a "revert and follow up
> with a fix".
> 
> I don't think it needs a ton of extra testing, with Okajima having
> already confirmed it fixes his problem case..

OK, incremental is in #fixes, pushed out.
Linus Torvalds Oct. 4, 2022, 12:37 a.m. UTC | #11
On Mon, Oct 3, 2022 at 5:31 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> OK, incremental is in #fixes, pushed out.

I'm assuming I'll still get a proper pull request. No?

              Linus
Al Viro Oct. 4, 2022, 12:52 a.m. UTC | #12
On Mon, Oct 03, 2022 at 05:37:24PM -0700, Linus Torvalds wrote:
> On Mon, Oct 3, 2022 at 5:31 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > OK, incremental is in #fixes, pushed out.
> 
> I'm assuming I'll still get a proper pull request. No?

Pull request follows; after all, whatever else is there, this is an obvious
fix for the breakage Okajima caught...

Al, very much hoping there's no other embarrassing fuckups lurking in that
thing ;-/

The following changes since commit 4fe89d07dcc2804c8b562f6c7896a45643d34b2f:

  Linux 6.0 (2022-10-02 14:09:07 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git tags/pull-fixes

for you to fetch changes up to 4f526fef91b24197d489ff86789744c67f475bb4:

  [brown paperbag] fix coredump breakage (2022-10-03 20:28:38 -0400)

----------------------------------------------------------------
fs/coredump fix

----------------------------------------------------------------
Al Viro (1):
      [brown paperbag] fix coredump breakage

 fs/coredump.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
Ira Weiny Oct. 4, 2022, 2:17 a.m. UTC | #13
On Tue, Oct 04, 2022 at 07:58:14AM +0900, J. R. Okajima wrote:
> Al Viro:
> > Argh....  Try this:
> >
> > fix coredump breakage caused by badly tested "[coredump] don't use __kernel_write() on kmap_local_page()"
> 
> Thanx, it passed my local test.

My mess up too.  I thought I had tested this when I had tested the
kmap_to_page() fix.

:-(

This works for me too now that I have my git trees straight.

Sorry.
Ira

> 
> 
> > * fix for problem that occurs on rather uncommon setups (and hadn't
> > been observed in the wild) sent very late in the cycle.
> 
> If the commit was merged in RC versions, I guess someone found the
> problem earlier.
> 
> 
> J. R. Okajima
Sedat Dilek Oct. 4, 2022, 6:18 a.m. UTC | #14
On Tue, Oct 4, 2022 at 2:51 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Mon, Oct 3, 2022 at 4:37 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > One variant would be to revert the original patch, put its
> > (hopefully) fixed variant into -next and let it sit there for
> > a while.  Another is to put this incremental into -next and
> > merge it into mainline once it gets a sane amount of testing.
>
> Just do the incremental fix. It looks obvious enough ("oops, we need
> to get the pos _after_ we've done any skip-lseeks on the core file")
> that I think it would be just harder to follow a "revert and follow up
> with a fix".
>
> I don't think it needs a ton of extra testing, with Okajima having
> already confirmed it fixes his problem case..
>
>                 Linus

[ CC Geert ]

There was another patch from Geert concerning the same coredump changes:

[PATCH] coredump: Move dump_emit_page() to kill unused warning

If CONFIG_ELF_CORE is not set:

    fs/coredump.c:835:12: error: ‘dump_emit_page’ defined but not used
[-Werror=unused-function]
      835 | static int dump_emit_page(struct coredump_params *cprm,
struct page *page)
          |            ^~~~~~~~~~~~~~

Fix this by moving dump_emit_page() inside the existing section
protected by #ifdef CONFIG_ELF_CORE.

Fixes: 06bbaa6dc53cb720 ("[coredump] don't use __kernel_write() on
kmap_local_page()")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>

Please, check yourself!

Best regards,
-Sedat-

[1] https://lore.kernel.org/all/20221003090657.2053236-1-geert@linux-m68k.org/
Sedat Dilek Oct. 9, 2022, 10:37 a.m. UTC | #15
On Tue, Oct 4, 2022 at 8:18 AM Sedat Dilek <sedat.dilek@gmail.com> wrote:
>
> On Tue, Oct 4, 2022 at 2:51 AM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > On Mon, Oct 3, 2022 at 4:37 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > One variant would be to revert the original patch, put its
> > > (hopefully) fixed variant into -next and let it sit there for
> > > a while.  Another is to put this incremental into -next and
> > > merge it into mainline once it gets a sane amount of testing.
> >
> > Just do the incremental fix. It looks obvious enough ("oops, we need
> > to get the pos _after_ we've done any skip-lseeks on the core file")
> > that I think it would be just harder to follow a "revert and follow up
> > with a fix".
> >
> > I don't think it needs a ton of extra testing, with Okajima having
> > already confirmed it fixes his problem case..
> >
> >                 Linus
>
> [ CC Geert ]
>
> There was another patch from Geert concerning the same coredump changes:
>
> [PATCH] coredump: Move dump_emit_page() to kill unused warning
>
> If CONFIG_ELF_CORE is not set:
>
>     fs/coredump.c:835:12: error: ‘dump_emit_page’ defined but not used
> [-Werror=unused-function]
>       835 | static int dump_emit_page(struct coredump_params *cprm,
> struct page *page)
>           |            ^~~~~~~~~~~~~~
>
> Fix this by moving dump_emit_page() inside the existing section
> protected by #ifdef CONFIG_ELF_CORE.
>
> Fixes: 06bbaa6dc53cb720 ("[coredump] don't use __kernel_write() on
> kmap_local_page()")
> Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
>
> Please, check yourself!
>
> Best regards,
> -Sedat-
>
> [1] https://lore.kernel.org/all/20221003090657.2053236-1-geert@linux-m68k.org/

[ CC Ren Zhijie <renzhijie2@huawei.com> ]

Looks like the same patch as of Geert.

-Sedat-

[1] https://lore.kernel.org/all/20221009092420.32850-1-renzhijie2@huawei.com/
Geert Uytterhoeven Nov. 9, 2022, 1:57 p.m. UTC | #16
Hi Al,

On Tue, Oct 4, 2022 at 8:19 AM Sedat Dilek <sedat.dilek@gmail.com> wrote:
> On Tue, Oct 4, 2022 at 2:51 AM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> > On Mon, Oct 3, 2022 at 4:37 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > One variant would be to revert the original patch, put its
> > > (hopefully) fixed variant into -next and let it sit there for
> > > a while.  Another is to put this incremental into -next and
> > > merge it into mainline once it gets a sane amount of testing.
> >
> > Just do the incremental fix. It looks obvious enough ("oops, we need
> > to get the pos _after_ we've done any skip-lseeks on the core file")
> > that I think it would be just harder to follow a "revert and follow up
> > with a fix".
> >
> > I don't think it needs a ton of extra testing, with Okajima having
> > already confirmed it fixes his problem case..
> >
> >                 Linus
>
> [ CC Geert ]
>
> There was another patch from Geert concerning the same coredump changes:
>
> [PATCH] coredump: Move dump_emit_page() to kill unused warning
>
> If CONFIG_ELF_CORE is not set:
>
>     fs/coredump.c:835:12: error: ‘dump_emit_page’ defined but not used
> [-Werror=unused-function]
>       835 | static int dump_emit_page(struct coredump_params *cprm,
> struct page *page)
>           |            ^~~~~~~~~~~~~~
>
> Fix this by moving dump_emit_page() inside the existing section
> protected by #ifdef CONFIG_ELF_CORE.
>
> Fixes: 06bbaa6dc53cb720 ("[coredump] don't use __kernel_write() on
> kmap_local_page()")
> Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
>
> Please, check yourself!

The build issue is still present in today's linux-next.
Al, can you please apply my fix, so Greg can backport all of this to stable?
https://lore.kernel.org/all/YzxxtFSCEsycgXSK@kroah.com

Thanks!

> [1] https://lore.kernel.org/all/20221003090657.2053236-1-geert@linux-m68k.org/

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds
diff mbox series

Patch

diff --git a/fs/coredump.c b/fs/coredump.c
index 9f4aae202109..09dbc981ad5c 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -832,6 +832,38 @@  static int __dump_skip(struct coredump_params *cprm, size_t nr)
 	}
 }
 
+static int dump_emit_page(struct coredump_params *cprm, struct page *page)
+{
+	struct bio_vec bvec = {
+		.bv_page	= page,
+		.bv_offset	= 0,
+		.bv_len		= PAGE_SIZE,
+	};
+	struct iov_iter iter;
+	struct file *file = cprm->file;
+	loff_t pos = file->f_pos;
+	ssize_t n;
+
+	if (cprm->to_skip) {
+		if (!__dump_skip(cprm, cprm->to_skip))
+			return 0;
+		cprm->to_skip = 0;
+	}
+	if (cprm->written + PAGE_SIZE > cprm->limit)
+		return 0;
+	if (dump_interrupted())
+		return 0;
+	iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE);
+	n = __kernel_write_iter(cprm->file, &iter, &pos);
+	if (n != PAGE_SIZE)
+		return 0;
+	file->f_pos = pos;
+	cprm->written += PAGE_SIZE;
+	cprm->pos += PAGE_SIZE;
+
+	return 1;
+}
+
 int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
 {
 	if (cprm->to_skip) {
@@ -863,7 +895,6 @@  int dump_user_range(struct coredump_params *cprm, unsigned long start,
 
 	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
 		struct page *page;
-		int stop;
 
 		/*
 		 * To avoid having to allocate page tables for virtual address
@@ -874,12 +905,7 @@  int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		 */
 		page = get_dump_page(addr);
 		if (page) {
-			void *kaddr = kmap_local_page(page);
-
-			stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
-			kunmap_local(kaddr);
-			put_page(page);
-			if (stop)
+			if (!dump_emit_page(cprm, page))
 				return 0;
 		} else {
 			dump_skip(cprm, PAGE_SIZE);
diff --git a/fs/internal.h b/fs/internal.h
index 87e96b9024ce..3e206d3e317c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -16,6 +16,7 @@  struct shrink_control;
 struct fs_context;
 struct user_namespace;
 struct pipe_inode_info;
+struct iov_iter;
 
 /*
  * block/bdev.c
@@ -221,3 +222,5 @@  ssize_t do_getxattr(struct user_namespace *mnt_userns,
 int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
 int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		struct xattr_ctx *ctx);
+
+ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);
diff --git a/fs/read_write.c b/fs/read_write.c
index 1a261dcf1778..328ce8cf9a85 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -496,14 +496,9 @@  static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
 }
 
 /* caller is responsible for file_start_write/file_end_write */
-ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
+ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
 {
-	struct kvec iov = {
-		.iov_base	= (void *)buf,
-		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
-	};
 	struct kiocb kiocb;
-	struct iov_iter iter;
 	ssize_t ret;
 
 	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
@@ -519,8 +514,7 @@  ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
 
 	init_sync_kiocb(&kiocb, file);
 	kiocb.ki_pos = pos ? *pos : 0;
-	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
-	ret = file->f_op->write_iter(&kiocb, &iter);
+	ret = file->f_op->write_iter(&kiocb, from);
 	if (ret > 0) {
 		if (pos)
 			*pos = kiocb.ki_pos;
@@ -530,6 +524,18 @@  ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
 	inc_syscw(current);
 	return ret;
 }
+
+/* caller is responsible for file_start_write/file_end_write */
+ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
+{
+	struct kvec iov = {
+		.iov_base	= (void *)buf,
+		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
+	};
+	struct iov_iter iter;
+	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
+	return __kernel_write_iter(file, &iter, pos);
+}
 /*
  * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
  * but autofs is one of the few internal kernel users that actually