diff mbox series

[RFC] fs/exec: Add the support for ELF program's NUMA replication

Message ID 20210906161613.4249-1-shijie@os.amperecomputing.com (mailing list archive)
State New
Headers show
Series [RFC] fs/exec: Add the support for ELF program's NUMA replication | expand

Commit Message

Huang Shijie Sept. 6, 2021, 4:16 p.m. UTC
This patch adds AT_NUMA_REPLICATION for execveat().

If this flag is set, the kernel will trigger COW(copy on write)
on the mmapped ELF binary. So the program will have a copied-page
on its NUMA node, even if the original page in page cache is
on other NUMA nodes.

Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
---
 fs/binfmt_elf.c            | 27 ++++++++++++++++++++++-----
 fs/exec.c                  |  5 ++++-
 include/linux/binfmts.h    |  1 +
 include/linux/mm.h         |  2 ++
 include/uapi/linux/fcntl.h |  2 ++
 mm/mprotect.c              |  2 +-
 6 files changed, 32 insertions(+), 7 deletions(-)

Comments

David Hildenbrand Sept. 6, 2021, 9:35 a.m. UTC | #1
On 06.09.21 18:16, Huang Shijie wrote:
> This patch adds AT_NUMA_REPLICATION for execveat().
> 
> If this flag is set, the kernel will trigger COW(copy on write)
> on the mmapped ELF binary. So the program will have a copied-page
> on its NUMA node, even if the original page in page cache is
> on other NUMA nodes.

Am I missing something important or is this just absolutely not what we 
want?

This means that for each and every invocation of the binary, we'll COW 
the complete binary -- an awful lot of wasted main memory and swap.

This is not a NUMA replication, this is en executable ELF code replication.

> 
> Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
> ---
>   fs/binfmt_elf.c            | 27 ++++++++++++++++++++++-----
>   fs/exec.c                  |  5 ++++-
>   include/linux/binfmts.h    |  1 +
>   include/linux/mm.h         |  2 ++
>   include/uapi/linux/fcntl.h |  2 ++
>   mm/mprotect.c              |  2 +-
>   6 files changed, 32 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 439ed81e755a..fac8f4a4555a 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -362,13 +362,14 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
>   
>   static unsigned long elf_map(struct file *filep, unsigned long addr,
>   		const struct elf_phdr *eppnt, int prot, int type,
> -		unsigned long total_size)
> +		unsigned long total_size, int numa_replication)
>   {
>   	unsigned long map_addr;
>   	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
>   	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
>   	addr = ELF_PAGESTART(addr);
>   	size = ELF_PAGEALIGN(size);
> +	int ret;
>   
>   	/* mmap() will return -EINVAL if given a zero size, but a
>   	 * segment with zero filesize is perfectly valid */
> @@ -385,11 +386,26 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
>   	*/
>   	if (total_size) {
>   		total_size = ELF_PAGEALIGN(total_size);
> -		map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
> +
> +		if (numa_replication) {
> +			/* Trigger the COW for this ELF code section */
> +			map_addr = vm_mmap(filep, addr, total_size, prot | PROT_WRITE,
> +						type | MAP_POPULATE, off);
> +			if (!IS_ERR_VALUE(map_addr) && !(prot & PROT_WRITE)) {
> +				/* Change back */
> +				ret = do_mprotect_pkey(map_addr, total_size, prot, -1);
> +				if (ret)
> +					return ret;
> +			}
> +		} else {
> +			map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
> +		}
> +
>   		if (!BAD_ADDR(map_addr))
>   			vm_munmap(map_addr+size, total_size-size);
> -	} else
> +	} else {
>   		map_addr = vm_mmap(filep, addr, size, prot, type, off);
> +	}
>   
>   	if ((type & MAP_FIXED_NOREPLACE) &&
>   	    PTR_ERR((void *)map_addr) == -EEXIST)
> @@ -635,7 +651,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>   				load_addr = -vaddr;
>   
>   			map_addr = elf_map(interpreter, load_addr + vaddr,
> -					eppnt, elf_prot, elf_type, total_size);
> +					eppnt, elf_prot, elf_type, total_size, 0);
>   			total_size = 0;
>   			error = map_addr;
>   			if (BAD_ADDR(map_addr))
> @@ -1139,7 +1155,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
>   		}
>   
>   		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
> -				elf_prot, elf_flags, total_size);
> +				elf_prot, elf_flags, total_size,
> +				bprm->support_numa_replication);
>   		if (BAD_ADDR(error)) {
>   			retval = IS_ERR((void *)error) ?
>   				PTR_ERR((void*)error) : -EINVAL;
> diff --git a/fs/exec.c b/fs/exec.c
> index 38f63451b928..d27efa540641 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -900,7 +900,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
>   		.lookup_flags = LOOKUP_FOLLOW,
>   	};
>   
> -	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
> +	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_NUMA_REPLICATION)) != 0)
>   		return ERR_PTR(-EINVAL);
>   	if (flags & AT_SYMLINK_NOFOLLOW)
>   		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
> @@ -1828,6 +1828,9 @@ static int bprm_execve(struct linux_binprm *bprm,
>   	if (retval)
>   		goto out;
>   
> +	/* Do we support NUMA replication for this program? */
> +	bprm->support_numa_replication = flags & AT_NUMA_REPLICATION;
> +
>   	retval = exec_binprm(bprm);
>   	if (retval < 0)
>   		goto out;
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index 049cf9421d83..1874e1732f20 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -64,6 +64,7 @@ struct linux_binprm {
>   	struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
>   
>   	char buf[BINPRM_BUF_SIZE];
> +	int support_numa_replication;
>   } __randomize_layout;
>   
>   #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7ca22e6e694a..76611381be2a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3244,6 +3244,8 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
>   #endif
>   
>   extern int sysctl_nr_trim_pages;
> +int do_mprotect_pkey(unsigned long start, size_t len,
> +			unsigned long prot, int pkey);
>   
>   #ifdef CONFIG_PRINTK
>   void mem_dump_obj(void *object);
> diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
> index 2f86b2ad6d7e..de99c5ae8eca 100644
> --- a/include/uapi/linux/fcntl.h
> +++ b/include/uapi/linux/fcntl.h
> @@ -111,4 +111,6 @@
>   
>   #define AT_RECURSIVE		0x8000	/* Apply to the entire subtree */
>   
> +#define AT_NUMA_REPLICATION	0x10000	/* Support NUMA replication for the ELF program */
> +
>   #endif /* _UAPI_LINUX_FCNTL_H */
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 883e2cc85cad..d1f8cececfed 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -519,7 +519,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
>   /*
>    * pkey==-1 when doing a legacy mprotect()
>    */
> -static int do_mprotect_pkey(unsigned long start, size_t len,
> +int do_mprotect_pkey(unsigned long start, size_t len,
>   		unsigned long prot, int pkey)
>   {
>   	unsigned long nstart, end, tmp, reqprot;
>
Matthew Wilcox Sept. 6, 2021, 12:09 p.m. UTC | #2
On Mon, Sep 06, 2021 at 04:16:13PM +0000, Huang Shijie wrote:
> This patch adds AT_NUMA_REPLICATION for execveat().
> 
> If this flag is set, the kernel will trigger COW(copy on write)
> on the mmapped ELF binary. So the program will have a copied-page
> on its NUMA node, even if the original page in page cache is
> on other NUMA nodes.

This does absolutely nothing for programs which run on multiple NUMA
nodes at the same time.

Also, I dislike the abuse of the term "COW" for this.  It's COF --
Copy On Fault.
David Hildenbrand Sept. 9, 2021, 8:58 a.m. UTC | #3
On 09.09.21 11:46, Huang Shijie wrote:
> On Mon, Sep 06, 2021 at 11:35:01AM +0200, David Hildenbrand wrote:
>> On 06.09.21 18:16, Huang Shijie wrote:
>>> This patch adds AT_NUMA_REPLICATION for execveat().
>>>
>>> If this flag is set, the kernel will trigger COW(copy on write)
>>> on the mmapped ELF binary. So the program will have a copied-page
>>> on its NUMA node, even if the original page in page cache is
>>> on other NUMA nodes.
>>
>> Am I missing something important or is this just absolutely not what we
>> want?
> 
> Please see the thread:
> https://marc.info/?l=linux-kernel&m=163070220429222&w=2
> 
> Linus did not think it is a good choice to implement the "per-numa node page cache"

That doesn't make this approach any better.

I don't think we want this in the kernel. If user space wants to waste 
memory, it can happily mmap() however it wants. The advisory is to not 
do it.
Huang Shijie Sept. 9, 2021, 9:46 a.m. UTC | #4
On Mon, Sep 06, 2021 at 11:35:01AM +0200, David Hildenbrand wrote:
> On 06.09.21 18:16, Huang Shijie wrote:
> > This patch adds AT_NUMA_REPLICATION for execveat().
> > 
> > If this flag is set, the kernel will trigger COW(copy on write)
> > on the mmapped ELF binary. So the program will have a copied-page
> > on its NUMA node, even if the original page in page cache is
> > on other NUMA nodes.
> 
> Am I missing something important or is this just absolutely not what we
> want?

Please see the thread:
https://marc.info/?l=linux-kernel&m=163070220429222&w=2

Linus did not think it is a good choice to implement the "per-numa node page cache"

> 
> This means that for each and every invocation of the binary, we'll COW the
> complete binary -- an awful lot of wasted main memory and swap.
Only the one who cares about the performance in NUMA will use this interface.
Most of the people never use it..


Thanks
Huang Shijie
Huang Shijie Sept. 9, 2021, 9:48 a.m. UTC | #5
On Mon, Sep 06, 2021 at 01:09:22PM +0100, Matthew Wilcox wrote:
> On Mon, Sep 06, 2021 at 04:16:13PM +0000, Huang Shijie wrote:
> > This patch adds AT_NUMA_REPLICATION for execveat().
> > 
> > If this flag is set, the kernel will trigger COW(copy on write)
> > on the mmapped ELF binary. So the program will have a copied-page
> > on its NUMA node, even if the original page in page cache is
> > on other NUMA nodes.
> 
> This does absolutely nothing for programs which run on multiple NUMA
> nodes at the same time.
Yes. Not suit for that case (Mysql database..).

> 
> Also, I dislike the abuse of the term "COW" for this.  It's COF --
> Copy On Fault.
Maybe it is better to add new flags "NUMA_REPLICATION" in mmap(). :)

Thanks
Huang Shijie
diff mbox series

Patch

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 439ed81e755a..fac8f4a4555a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -362,13 +362,14 @@  create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
 
 static unsigned long elf_map(struct file *filep, unsigned long addr,
 		const struct elf_phdr *eppnt, int prot, int type,
-		unsigned long total_size)
+		unsigned long total_size, int numa_replication)
 {
 	unsigned long map_addr;
 	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
 	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
 	addr = ELF_PAGESTART(addr);
 	size = ELF_PAGEALIGN(size);
+	int ret;
 
 	/* mmap() will return -EINVAL if given a zero size, but a
 	 * segment with zero filesize is perfectly valid */
@@ -385,11 +386,26 @@  static unsigned long elf_map(struct file *filep, unsigned long addr,
 	*/
 	if (total_size) {
 		total_size = ELF_PAGEALIGN(total_size);
-		map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
+
+		if (numa_replication) {
+			/* Trigger the COW for this ELF code section */
+			map_addr = vm_mmap(filep, addr, total_size, prot | PROT_WRITE,
+						type | MAP_POPULATE, off);
+			if (!IS_ERR_VALUE(map_addr) && !(prot & PROT_WRITE)) {
+				/* Change back */
+				ret = do_mprotect_pkey(map_addr, total_size, prot, -1);
+				if (ret)
+					return ret;
+			}
+		} else {
+			map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
+		}
+
 		if (!BAD_ADDR(map_addr))
 			vm_munmap(map_addr+size, total_size-size);
-	} else
+	} else {
 		map_addr = vm_mmap(filep, addr, size, prot, type, off);
+	}
 
 	if ((type & MAP_FIXED_NOREPLACE) &&
 	    PTR_ERR((void *)map_addr) == -EEXIST)
@@ -635,7 +651,7 @@  static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 				load_addr = -vaddr;
 
 			map_addr = elf_map(interpreter, load_addr + vaddr,
-					eppnt, elf_prot, elf_type, total_size);
+					eppnt, elf_prot, elf_type, total_size, 0);
 			total_size = 0;
 			error = map_addr;
 			if (BAD_ADDR(map_addr))
@@ -1139,7 +1155,8 @@  static int load_elf_binary(struct linux_binprm *bprm)
 		}
 
 		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
-				elf_prot, elf_flags, total_size);
+				elf_prot, elf_flags, total_size,
+				bprm->support_numa_replication);
 		if (BAD_ADDR(error)) {
 			retval = IS_ERR((void *)error) ?
 				PTR_ERR((void*)error) : -EINVAL;
diff --git a/fs/exec.c b/fs/exec.c
index 38f63451b928..d27efa540641 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -900,7 +900,7 @@  static struct file *do_open_execat(int fd, struct filename *name, int flags)
 		.lookup_flags = LOOKUP_FOLLOW,
 	};
 
-	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_NUMA_REPLICATION)) != 0)
 		return ERR_PTR(-EINVAL);
 	if (flags & AT_SYMLINK_NOFOLLOW)
 		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
@@ -1828,6 +1828,9 @@  static int bprm_execve(struct linux_binprm *bprm,
 	if (retval)
 		goto out;
 
+	/* Do we support NUMA replication for this program? */
+	bprm->support_numa_replication = flags & AT_NUMA_REPLICATION;
+
 	retval = exec_binprm(bprm);
 	if (retval < 0)
 		goto out;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 049cf9421d83..1874e1732f20 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -64,6 +64,7 @@  struct linux_binprm {
 	struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
 
 	char buf[BINPRM_BUF_SIZE];
+	int support_numa_replication;
 } __randomize_layout;
 
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7ca22e6e694a..76611381be2a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3244,6 +3244,8 @@  unsigned long wp_shared_mapping_range(struct address_space *mapping,
 #endif
 
 extern int sysctl_nr_trim_pages;
+int do_mprotect_pkey(unsigned long start, size_t len,
+			unsigned long prot, int pkey);
 
 #ifdef CONFIG_PRINTK
 void mem_dump_obj(void *object);
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 2f86b2ad6d7e..de99c5ae8eca 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -111,4 +111,6 @@ 
 
 #define AT_RECURSIVE		0x8000	/* Apply to the entire subtree */
 
+#define AT_NUMA_REPLICATION	0x10000	/* Support NUMA replication for the ELF program */
+
 #endif /* _UAPI_LINUX_FCNTL_H */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 883e2cc85cad..d1f8cececfed 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -519,7 +519,7 @@  mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 /*
  * pkey==-1 when doing a legacy mprotect()
  */
-static int do_mprotect_pkey(unsigned long start, size_t len,
+int do_mprotect_pkey(unsigned long start, size_t len,
 		unsigned long prot, int pkey)
 {
 	unsigned long nstart, end, tmp, reqprot;