diff mbox series

cgroup/cpuset: Add a new isolated mems.policy type.

Message ID 20220904040241.1708-1-hezhongkun.hzk@bytedance.com (mailing list archive)
State New
Headers show
Series cgroup/cpuset: Add a new isolated mems.policy type. | expand

Commit Message

Zhongkun He Sept. 4, 2022, 4:02 a.m. UTC
From: Zhongkun He <hezhongkun.hzk@bytedance.com>

Mempolicy is difficult to use because it is set in-process
via a system call. We want to make it easier to use mempolicy
in cpuset, so that we can control low-priority cgroups to
allocate memory in specified nodes. So this patch adds
the mempolicy interface in cpuset.

The mempolicy priority of cpuset is lower than the task.
The order of getting the policy is:
	1) vma mempolicy
	2) task->mempolicy
	3) cpuset->mempolicy
	4) default policy.

cpuset's policy is owned by itself, but descendants will
get the default mempolicy from parent.

How to use the mempolicy interface:
	echo prefer:2 > /sys/fs/cgroup/zz/cpuset.mems.policy
	echo bind:1-3 > /sys/fs/cgroup/zz/cpuset.mems.policy
        echo interleave:0,1,2,3 >/sys/fs/cgroup/zz/cpuset.mems.policy
Show the policy:
	cat /sys/fs/cgroup/zz/cpuset.mems.policy
		prefer:2
	cat /sys/fs/cgroup/zz/cpuset.mems.policy
		bind:1-3
	cat /sys/fs/cgroup/zz/cpuset.mems.policy
		interleave:0-3
Clear the policy:
	echo default > /sys/fs/cgroup/zz/cpuset.mems.policy

Signed-off-by: Zhongkun He <hezhongkun.hzk@bytedance.com>
---
 include/linux/mempolicy.h |   9 ++
 include/linux/sched.h     |   2 +
 kernel/cgroup/cpuset.c    | 182 +++++++++++++++++++++++++++++++++++++-
 kernel/fork.c             |   1 +
 mm/mempolicy.c            |  25 ++++--
 5 files changed, 209 insertions(+), 10 deletions(-)


base-commit: c40e8341e3b3bb27e3a65b06b5b454626234c4f0

Comments

kernel test robot Sept. 4, 2022, 6:04 a.m. UTC | #1
Hi hezhongkun,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on c40e8341e3b3bb27e3a65b06b5b454626234c4f0]

url:    https://github.com/intel-lab-lkp/linux/commits/hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436
base:   c40e8341e3b3bb27e3a65b06b5b454626234c4f0
config: x86_64-randconfig-a014
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436
        git checkout aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash kernel/cgroup/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> kernel/cgroup/cpuset.c:2515:13: warning: unused variable 'cs_allowed' [-Wunused-variable]
           nodemask_t cs_allowed;
                      ^
   1 warning generated.


vim +/cs_allowed +2515 kernel/cgroup/cpuset.c

  2508	
  2509	/* change cpuset mempolicy */
  2510	static ssize_t cpuset_mpol_write(struct kernfs_open_file *of,
  2511			char *buf, size_t nbytes, loff_t off)
  2512	{
  2513		struct mempolicy *mpol, *old = NULL;
  2514		struct cpuset *cs = css_cs(of_css(of));
> 2515		nodemask_t cs_allowed;
  2516		int err = -ENODEV;
  2517	
  2518		css_get(&cs->css);
  2519		kernfs_break_active_protection(of->kn);
  2520		percpu_down_write(&cpuset_rwsem);
  2521	
  2522		if (!is_cpuset_online(cs))
  2523			goto out_unlock;
  2524	
  2525		buf = strstrip(buf);
  2526		err = mpol_parse_str(buf, &mpol);
  2527	
  2528		if (err) {
  2529			err = -EINVAL;
  2530			goto out_unlock;
  2531		}
  2532	
  2533		spin_lock_irq(&callback_lock);
  2534		old = cs->mempolicy;
  2535		update_cs_mpol_nodemask(cs, mpol);
  2536		cs->mempolicy = mpol;
  2537		spin_unlock_irq(&callback_lock);
  2538	
  2539		update_tasks_cs_mpol(cs);
  2540	
  2541	out_unlock:
  2542		percpu_up_write(&cpuset_rwsem);
  2543		kernfs_unbreak_active_protection(of->kn);
  2544		css_put(&cs->css);
  2545	
  2546		if (old) {
  2547		/*Wait for outstanding programs to complete.*/
  2548			synchronize_rcu();
  2549			mpol_put(old);
  2550		}
  2551		return err ?: nbytes;
  2552	}
  2553
kernel test robot Sept. 4, 2022, 6:20 a.m. UTC | #2
Hi hezhongkun,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on c40e8341e3b3bb27e3a65b06b5b454626234c4f0]

url:    https://github.com/intel-lab-lkp/linux/commits/hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436
base:   c40e8341e3b3bb27e3a65b06b5b454626234c4f0
config: x86_64-defconfig
compiler: gcc-11 (Debian 11.3.0-5) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436
        git checkout aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash kernel/cgroup/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   kernel/cgroup/cpuset.c: In function 'cpuset_mpol_write':
>> kernel/cgroup/cpuset.c:2515:20: warning: unused variable 'cs_allowed' [-Wunused-variable]
    2515 |         nodemask_t cs_allowed;
         |                    ^~~~~~~~~~


vim +/cs_allowed +2515 kernel/cgroup/cpuset.c

  2508	
  2509	/* change cpuset mempolicy */
  2510	static ssize_t cpuset_mpol_write(struct kernfs_open_file *of,
  2511			char *buf, size_t nbytes, loff_t off)
  2512	{
  2513		struct mempolicy *mpol, *old = NULL;
  2514		struct cpuset *cs = css_cs(of_css(of));
> 2515		nodemask_t cs_allowed;
  2516		int err = -ENODEV;
  2517	
  2518		css_get(&cs->css);
  2519		kernfs_break_active_protection(of->kn);
  2520		percpu_down_write(&cpuset_rwsem);
  2521	
  2522		if (!is_cpuset_online(cs))
  2523			goto out_unlock;
  2524	
  2525		buf = strstrip(buf);
  2526		err = mpol_parse_str(buf, &mpol);
  2527	
  2528		if (err) {
  2529			err = -EINVAL;
  2530			goto out_unlock;
  2531		}
  2532	
  2533		spin_lock_irq(&callback_lock);
  2534		old = cs->mempolicy;
  2535		update_cs_mpol_nodemask(cs, mpol);
  2536		cs->mempolicy = mpol;
  2537		spin_unlock_irq(&callback_lock);
  2538	
  2539		update_tasks_cs_mpol(cs);
  2540	
  2541	out_unlock:
  2542		percpu_up_write(&cpuset_rwsem);
  2543		kernfs_unbreak_active_protection(of->kn);
  2544		css_put(&cs->css);
  2545	
  2546		if (old) {
  2547		/*Wait for outstanding programs to complete.*/
  2548			synchronize_rcu();
  2549			mpol_put(old);
  2550		}
  2551		return err ?: nbytes;
  2552	}
  2553
kernel test robot Sept. 4, 2022, 6:41 a.m. UTC | #3
Hi hezhongkun,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on c40e8341e3b3bb27e3a65b06b5b454626234c4f0]

url:    https://github.com/intel-lab-lkp/linux/commits/hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436
base:   c40e8341e3b3bb27e3a65b06b5b454626234c4f0
config: s390-randconfig-r044-20220904
compiler: s390-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436
        git checkout aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> mm/mempolicy.c:2980:5: warning: no previous prototype for 'mpol_parse_str' [-Wmissing-prototypes]
    2980 | int mpol_parse_str(char *str, struct mempolicy **mpol)
         |     ^~~~~~~~~~~~~~


vim +/mpol_parse_str +2980 mm/mempolicy.c

1a75a6c825c172 Christoph Lameter      2006-01-08  2968  
aafa4e2c9e5c3c Zhongkun He            2022-09-04  2969  #if defined(CONFIG_TMPFS) || defined(CONFIG_NUMA)
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2970  /**
f2a07f40dbc603 Hugh Dickins           2013-01-02  2971   * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2972   * @str:  string containing mempolicy to parse
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  2973   * @mpol:  pointer to struct mempolicy pointer, returned on success.
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2974   *
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2975   * Format of input:
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2976   *	<mode>[=<flags>][:<nodelist>]
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2977   *
dad5b023294981 Randy Dunlap           2022-01-14  2978   * Return: %0 on success, else %1
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2979   */
a7a88b23737095 Hugh Dickins           2013-01-02 @2980  int mpol_parse_str(char *str, struct mempolicy **mpol)
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2981  {
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  2982  	struct mempolicy *new = NULL;
f2a07f40dbc603 Hugh Dickins           2013-01-02  2983  	unsigned short mode_flags;
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  2984  	nodemask_t nodes;
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2985  	char *nodelist = strchr(str, ':');
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2986  	char *flags = strchr(str, '=');
dedf2c73b80b45 zhong jiang            2018-10-26  2987  	int err = 1, mode;
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2988  
c7a91bc7c2e17e Dan Carpenter          2020-01-30  2989  	if (flags)
c7a91bc7c2e17e Dan Carpenter          2020-01-30  2990  		*flags++ = '\0';	/* terminate mode string */
c7a91bc7c2e17e Dan Carpenter          2020-01-30  2991  
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2992  	if (nodelist) {
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2993  		/* NUL-terminate mode or flags string */
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2994  		*nodelist++ = '\0';
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  2995  		if (nodelist_parse(nodelist, nodes))
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2996  			goto out;
01f13bd607346a Lai Jiangshan          2012-12-12  2997  		if (!nodes_subset(nodes, node_states[N_MEMORY]))
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  2998  			goto out;
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  2999  	} else
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3000  		nodes_clear(nodes);
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3001  
dedf2c73b80b45 zhong jiang            2018-10-26  3002  	mode = match_string(policy_modes, MPOL_MAX, str);
dedf2c73b80b45 zhong jiang            2018-10-26  3003  	if (mode < 0)
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3004  		goto out;
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3005  
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3006  	switch (mode) {
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3007  	case MPOL_PREFERRED:
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3008  		/*
aa9f7d5172fac9 Randy Dunlap           2020-04-01  3009  		 * Insist on a nodelist of one node only, although later
aa9f7d5172fac9 Randy Dunlap           2020-04-01  3010  		 * we use first_node(nodes) to grab a single node, so here
aa9f7d5172fac9 Randy Dunlap           2020-04-01  3011  		 * nodelist (or nodes) cannot be empty.
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3012  		 */
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3013  		if (nodelist) {
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3014  			char *rest = nodelist;
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3015  			while (isdigit(*rest))
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3016  				rest++;
926f2ae04f1830 KOSAKI Motohiro        2010-03-23  3017  			if (*rest)
926f2ae04f1830 KOSAKI Motohiro        2010-03-23  3018  				goto out;
aa9f7d5172fac9 Randy Dunlap           2020-04-01  3019  			if (nodes_empty(nodes))
aa9f7d5172fac9 Randy Dunlap           2020-04-01  3020  				goto out;
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3021  		}
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3022  		break;
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3023  	case MPOL_INTERLEAVE:
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3024  		/*
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3025  		 * Default to online nodes with memory if no nodelist
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3026  		 */
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3027  		if (!nodelist)
01f13bd607346a Lai Jiangshan          2012-12-12  3028  			nodes = node_states[N_MEMORY];
3f226aa1cbc006 Lee Schermerhorn       2008-04-28  3029  		break;
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3030  	case MPOL_LOCAL:
3f226aa1cbc006 Lee Schermerhorn       2008-04-28  3031  		/*
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3032  		 * Don't allow a nodelist;  mpol_new() checks flags
3f226aa1cbc006 Lee Schermerhorn       2008-04-28  3033  		 */
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3034  		if (nodelist)
3f226aa1cbc006 Lee Schermerhorn       2008-04-28  3035  			goto out;
3f226aa1cbc006 Lee Schermerhorn       2008-04-28  3036  		break;
413b43deab8377 Ravikiran G Thirumalai 2010-03-23  3037  	case MPOL_DEFAULT:
413b43deab8377 Ravikiran G Thirumalai 2010-03-23  3038  		/*
413b43deab8377 Ravikiran G Thirumalai 2010-03-23  3039  		 * Insist on a empty nodelist
413b43deab8377 Ravikiran G Thirumalai 2010-03-23  3040  		 */
413b43deab8377 Ravikiran G Thirumalai 2010-03-23  3041  		if (!nodelist)
413b43deab8377 Ravikiran G Thirumalai 2010-03-23  3042  			err = 0;
413b43deab8377 Ravikiran G Thirumalai 2010-03-23  3043  		goto out;
b27abaccf8e8b0 Dave Hansen            2021-09-02  3044  	case MPOL_PREFERRED_MANY:
d69b2e63e9172a KOSAKI Motohiro        2010-03-23  3045  	case MPOL_BIND:
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3046  		/*
d69b2e63e9172a KOSAKI Motohiro        2010-03-23  3047  		 * Insist on a nodelist
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3048  		 */
d69b2e63e9172a KOSAKI Motohiro        2010-03-23  3049  		if (!nodelist)
d69b2e63e9172a KOSAKI Motohiro        2010-03-23  3050  			goto out;
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3051  	}
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3052  
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3053  	mode_flags = 0;
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3054  	if (flags) {
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3055  		/*
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3056  		 * Currently, we only support two mutually exclusive
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3057  		 * mode flags.
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3058  		 */
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3059  		if (!strcmp(flags, "static"))
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3060  			mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3061  		else if (!strcmp(flags, "relative"))
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3062  			mode_flags |= MPOL_F_RELATIVE_NODES;
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3063  		else
926f2ae04f1830 KOSAKI Motohiro        2010-03-23  3064  			goto out;
aafa4e2c9e5c3c Zhongkun He            2022-09-04  3065  	} else {
aafa4e2c9e5c3c Zhongkun He            2022-09-04  3066  		/*use static mode_flags in default*/
aafa4e2c9e5c3c Zhongkun He            2022-09-04  3067  		mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3068  	}
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3069  
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3070  	new = mpol_new(mode, mode_flags, &nodes);
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3071  	if (IS_ERR(new))
926f2ae04f1830 KOSAKI Motohiro        2010-03-23  3072  		goto out;
926f2ae04f1830 KOSAKI Motohiro        2010-03-23  3073  
f2a07f40dbc603 Hugh Dickins           2013-01-02  3074  	/*
f2a07f40dbc603 Hugh Dickins           2013-01-02  3075  	 * Save nodes for mpol_to_str() to show the tmpfs mount options
f2a07f40dbc603 Hugh Dickins           2013-01-02  3076  	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
f2a07f40dbc603 Hugh Dickins           2013-01-02  3077  	 */
269fbe72cded0a Ben Widawsky           2021-06-30  3078  	if (mode != MPOL_PREFERRED) {
269fbe72cded0a Ben Widawsky           2021-06-30  3079  		new->nodes = nodes;
269fbe72cded0a Ben Widawsky           2021-06-30  3080  	} else if (nodelist) {
269fbe72cded0a Ben Widawsky           2021-06-30  3081  		nodes_clear(new->nodes);
269fbe72cded0a Ben Widawsky           2021-06-30  3082  		node_set(first_node(nodes), new->nodes);
269fbe72cded0a Ben Widawsky           2021-06-30  3083  	} else {
7858d7bca7fbbb Feng Tang              2021-06-30  3084  		new->mode = MPOL_LOCAL;
269fbe72cded0a Ben Widawsky           2021-06-30  3085  	}
f2a07f40dbc603 Hugh Dickins           2013-01-02  3086  
f2a07f40dbc603 Hugh Dickins           2013-01-02  3087  	/*
f2a07f40dbc603 Hugh Dickins           2013-01-02  3088  	 * Save nodes for contextualization: this will be used to "clone"
f2a07f40dbc603 Hugh Dickins           2013-01-02  3089  	 * the mempolicy in a specific context [cpuset] at a later time.
f2a07f40dbc603 Hugh Dickins           2013-01-02  3090  	 */
e17f74af351cce Lee Schermerhorn       2010-05-24  3091  	new->w.user_nodemask = nodes;
f2a07f40dbc603 Hugh Dickins           2013-01-02  3092  
926f2ae04f1830 KOSAKI Motohiro        2010-03-23  3093  	err = 0;
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3094  
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3095  out:
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3096  	/* Restore string for error message */
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3097  	if (nodelist)
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3098  		*--nodelist = ':';
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3099  	if (flags)
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3100  		*--flags = '=';
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3101  	if (!err)
71fe804b6d56d6 Lee Schermerhorn       2008-04-28  3102  		*mpol = new;
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3103  	return err;
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3104  }
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3105  #endif /* CONFIG_TMPFS */
095f1fc4ebf36c Lee Schermerhorn       2008-04-28  3106
kernel test robot Sept. 4, 2022, 11:08 p.m. UTC | #4
Hi hezhongkun,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on c40e8341e3b3bb27e3a65b06b5b454626234c4f0]

url:    https://github.com/intel-lab-lkp/linux/commits/hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436
base:   c40e8341e3b3bb27e3a65b06b5b454626234c4f0
config: ia64-randconfig-c44-20220904
compiler: ia64-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436
        git checkout aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=ia64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   kernel/cgroup/cpuset.c: In function 'cpuset_mpol_write':
>> kernel/cgroup/cpuset.c:2526:15: error: implicit declaration of function 'mpol_parse_str'; did you mean 'mpol_to_str'? [-Werror=implicit-function-declaration]
    2526 |         err = mpol_parse_str(buf, &mpol);
         |               ^~~~~~~~~~~~~~
         |               mpol_to_str
   kernel/cgroup/cpuset.c:2515:20: warning: unused variable 'cs_allowed' [-Wunused-variable]
    2515 |         nodemask_t cs_allowed;
         |                    ^~~~~~~~~~
   cc1: some warnings being treated as errors


vim +2526 kernel/cgroup/cpuset.c

  2508	
  2509	/* change cpuset mempolicy */
  2510	static ssize_t cpuset_mpol_write(struct kernfs_open_file *of,
  2511			char *buf, size_t nbytes, loff_t off)
  2512	{
  2513		struct mempolicy *mpol, *old = NULL;
  2514		struct cpuset *cs = css_cs(of_css(of));
  2515		nodemask_t cs_allowed;
  2516		int err = -ENODEV;
  2517	
  2518		css_get(&cs->css);
  2519		kernfs_break_active_protection(of->kn);
  2520		percpu_down_write(&cpuset_rwsem);
  2521	
  2522		if (!is_cpuset_online(cs))
  2523			goto out_unlock;
  2524	
  2525		buf = strstrip(buf);
> 2526		err = mpol_parse_str(buf, &mpol);
  2527	
  2528		if (err) {
  2529			err = -EINVAL;
  2530			goto out_unlock;
  2531		}
  2532	
  2533		spin_lock_irq(&callback_lock);
  2534		old = cs->mempolicy;
  2535		update_cs_mpol_nodemask(cs, mpol);
  2536		cs->mempolicy = mpol;
  2537		spin_unlock_irq(&callback_lock);
  2538	
  2539		update_tasks_cs_mpol(cs);
  2540	
  2541	out_unlock:
  2542		percpu_up_write(&cpuset_rwsem);
  2543		kernfs_unbreak_active_protection(of->kn);
  2544		css_put(&cs->css);
  2545	
  2546		if (old) {
  2547		/*Wait for outstanding programs to complete.*/
  2548			synchronize_rcu();
  2549			mpol_put(old);
  2550		}
  2551		return err ?: nbytes;
  2552	}
  2553
Michal Hocko Sept. 5, 2022, 6:45 a.m. UTC | #5
On Sun 04-09-22 12:02:41, hezhongkun wrote:
> From: Zhongkun He <hezhongkun.hzk@bytedance.com>
> 
> Mempolicy is difficult to use because it is set in-process
> via a system call. We want to make it easier to use mempolicy
> in cpuset, and  we can control low-priority cgroups to
> allocate memory in specified nodes. So this patch want to
> adds the mempolicy interface in cpuset.
> 
> The mempolicy priority of cpuset is lower than the task.
> The order of getting the policy is:
> 	1) vma mempolicy
> 	2) task->mempolicy
> 	3) cpuset->mempolicy
> 	4) default policy.
> 
> cpuset's policy is owned by itself, but descendants will
> get the default mempolicy from parent.

What is the hierarchical behavior of the policy? Say parent has a
stronger requirement (say bind) than a child (prefer)?
 
> How to use the mempolicy interface:
> 	echo prefer:2 > /sys/fs/cgroup/zz/cpuset.mems.policy
> 	echo bind:1-3 > /sys/fs/cgroup/zz/cpuset.mems.policy
>         echo interleave:0,1,2,3 >/sys/fs/cgroup/zz/cpuset.mems.policy

Am I just confused or did you really mean to combine all these
together?
Zhongkun He Sept. 5, 2022, 10:30 a.m. UTC | #6
Hi Michal, thanks for your reply.

The current 'mempolicy' is hierarchically independent. The default value 
of the child is to inherit from the parent. The modification of the 
child policy will not be restricted by the parent.

Of course, there are other options, such as requiring the child's policy
mode to be the same as the parent's, with the child's nodes a subset of the
parent's; but the interleave type would make that complicated, which is why
hierarchical independence is used. It would be great if you have other
suggestions.

Thanks.

> On Sun 04-09-22 12:02:41, hezhongkun wrote:
>> From: Zhongkun He <hezhongkun.hzk@bytedance.com>
>>
>> Mempolicy is difficult to use because it is set in-process
>> via a system call. We want to make it easier to use mempolicy
>> in cpuset, and  we can control low-priority cgroups to
>> allocate memory in specified nodes. So this patch want to
>> adds the mempolicy interface in cpuset.
>>
>> The mempolicy priority of cpuset is lower than the task.
>> The order of getting the policy is:
>> 	1) vma mempolicy
>> 	2) task->mempolicy
>> 	3) cpuset->mempolicy
>> 	4) default policy.
>>
>> cpuset's policy is owned by itself, but descendants will
>> get the default mempolicy from parent.
> 
> What is the hierarchical behavior of the policy? Say parent has a
> stronger requirement (say bind) than a child (prefer)?
>   
>> How to use the mempolicy interface:
>> 	echo prefer:2 > /sys/fs/cgroup/zz/cpuset.mems.policy
>> 	echo bind:1-3 > /sys/fs/cgroup/zz/cpuset.mems.policy
>>          echo interleave:0,1,2,3 >/sys/fs/cgroup/zz/cpuset.mems.policy
> 
> Am I just confused or did you really mean to combine all these
> together?
Michal Hocko Sept. 5, 2022, 10:50 a.m. UTC | #7
On Mon 05-09-22 18:30:55, Zhongkun He wrote:
> Hi Michal, thanks for your reply.
> 
> The current 'mempolicy' is hierarchically independent. The default value of
> the child is to inherit from the parent. The modification of the child
> policy will not be restricted by the parent.

This breaks cgroup fundamental property of hierarchical enforcement of
each property. And as such it is a no go.

> Of course, there are other options, such as the child's policy mode must be
> the same as the parent's. node can be the subset of parent's, but the
> interleave type will be complicated, that's why hierarchy independence is
> used. It would be better if you have other suggestions?

Honestly, I am not really sure cgroup cpusets is a great fit for this
usecase. It would be probably better to elaborate some more what are the
existing shortcomings and what you would like to achieve. Just stating
the syscall is a hard to use interface is not quite clear on its own.

Btw. have you noticed this question?

> > What is the hierarchical behavior of the policy? Say parent has a
> > stronger requirement (say bind) than a child (prefer)?
> > > How to use the mempolicy interface:
> > > 	echo prefer:2 > /sys/fs/cgroup/zz/cpuset.mems.policy
> > > 	echo bind:1-3 > /sys/fs/cgroup/zz/cpuset.mems.policy
> > >          echo interleave:0,1,2,3 >/sys/fs/cgroup/zz/cpuset.mems.policy
> > 
> > Am I just confused or did you really mean to combine all these
> > together?
Zhongkun He Sept. 6, 2022, 10:37 a.m. UTC | #8
> On Mon 05-09-22 18:30:55, Zhongkun He wrote:
>> Hi Michal, thanks for your reply.
>>
>> The current 'mempolicy' is hierarchically independent. The default value of
>> the child is to inherit from the parent. The modification of the child
>> policy will not be restricted by the parent.
> 
> This breaks cgroup fundamental property of hierarchical enforcement of
> each property. And as such it is a no go.
> 
>> Of course, there are other options, such as the child's policy mode must be
>> the same as the parent's. node can be the subset of parent's, but the
>> interleave type will be complicated, that's why hierarchy independence is
>> used. It would be better if you have other suggestions?
> 
> Honestly, I am not really sure cgroup cpusets is a great fit for this
> usecase. It would be probably better to elaborate some more what are the
> existing shortcomings and what you would like to achieve. Just stating
> the syscall is a hard to use interface is not quite clear on its own.
> 
> Btw. have you noticed this question?
> 
>>> What is the hierarchical behavior of the policy? Say parent has a
>>> stronger requirement (say bind) than a child (prefer)?
>>>> How to use the mempolicy interface:
>>>> 	echo prefer:2 > /sys/fs/cgroup/zz/cpuset.mems.policy
>>>> 	echo bind:1-3 > /sys/fs/cgroup/zz/cpuset.mems.policy
>>>>           echo interleave:0,1,2,3 >/sys/fs/cgroup/zz/cpuset.mems.policy
>>>
>>> Am I just confused or did you really mean to combine all these
>>> together?
>

Hi Michal, thanks for your reply.

 >>Say parent has a stronger requirement (say bind) than a child(prefer)?

Yes, these are combined together. The parent's tasks will use 'bind', and
the child's will use 'prefer'. This is the current implementation, and we
can discuss and modify it together if there are other suggestions.

1:Existing shortcomings

In our use case, the application and the control plane are two separate 
systems. When the application is created, it doesn't know how to use 
memory, and it doesn't care. The control plane will decide the memory 
usage policy based on different reasons (the attributes of the 
application itself, the priority, the remaining resources of the 
system). Currently, numactl is used to set it at program startup, and 
the child process will inherit the mempolicy. But we can't dynamically 
adjust the memory policy, except restart, the memory policy will not change.

2:Our goals

For the above reasons, we want to create a mempolicy at the cgroup 
level. Usually processes under a cgroup have the same priority and 
attributes, and we can dynamically adjust the memory allocation strategy 
according to the remaining resources of the system. For example, a 
low-priority cgroup uses the 'bind:2-3' policy, and a high-priority 
cgroup uses bind:0-1. When resources are insufficient, they will be
changed to bind:3 and bind:0-2 by the control plane, etc. Furthermore,
more mempolicies can be added, such as allocating memory according to
node weight, etc.

Thanks.
Michal Hocko Sept. 6, 2022, 12:33 p.m. UTC | #9
On Tue 06-09-22 18:37:40, Zhongkun He wrote:
> > On Mon 05-09-22 18:30:55, Zhongkun He wrote:
> > > Hi Michal, thanks for your reply.
> > > 
> > > The current 'mempolicy' is hierarchically independent. The default value of
> > > the child is to inherit from the parent. The modification of the child
> > > policy will not be restricted by the parent.
> > 
> > This breaks cgroup fundamental property of hierarchical enforcement of
> > each property. And as such it is a no go.
> > 
> > > Of course, there are other options, such as the child's policy mode must be
> > > the same as the parent's. node can be the subset of parent's, but the
> > > interleave type will be complicated, that's why hierarchy independence is
> > > used. It would be better if you have other suggestions?
> > 
> > Honestly, I am not really sure cgroup cpusets is a great fit for this
> > usecase. It would be probably better to elaborate some more what are the
> > existing shortcomings and what you would like to achieve. Just stating
> > the syscall is a hard to use interface is not quite clear on its own.
> > 
> > Btw. have you noticed this question?
> > 
> > > > What is the hierarchical behavior of the policy? Say parent has a
> > > > stronger requirement (say bind) than a child (prefer)?
> > > > > How to use the mempolicy interface:
> > > > > 	echo prefer:2 > /sys/fs/cgroup/zz/cpuset.mems.policy
> > > > > 	echo bind:1-3 > /sys/fs/cgroup/zz/cpuset.mems.policy
> > > > >           echo interleave:0,1,2,3 >/sys/fs/cgroup/zz/cpuset.mems.policy
> > > > 
> > > > Am I just confused or did you really mean to combine all these
> > > > together?
> > 
> 
> Hi Michal, thanks for your reply.
> 
> >>Say parent has a stronger requirement (say bind) than a child(prefer)?
> 
> Yes, combine all these together.

What is the semantic of the resulting policy?

> The parent's task will use 'bind', the child's
> use 'prefer'. This is the current implementation, and we can discuss and
> modify it together if there are other suggestions.
>
> 1:Existing shortcomings
> 
> In our use case, the application and the control plane are two separate
> systems. When the application is created, it doesn't know how to use memory,
> and it doesn't care. The control plane will decide the memory usage policy
> based on different reasons (the attributes of the application itself, the
> priority, the remaining resources of the system). Currently, numactl is used
> to set it at program startup, and the child process will inherit the
> mempolicy.

Yes this is common practice I have seen so far.

> But we can't dynamically adjust the memory policy, except
> restart, the memory policy will not change.

Do you really need to change the policy itself or only the effective
nodemask? I mean what is your usecase to go from say mbind to preferred
policy?  Do you need any other policy than bind and preferred?
 
> 2:Our goals
> 
> For the above reasons, we want to create a mempolicy at the cgroup level.
> Usually processes under a cgroup have the same priority and attributes, and
> we can dynamically adjust the memory allocation strategy according to the
> remaining resources of the system. For example, a low-priority cgroup uses
> the 'bind:2-3' policy, and a high-priority cgroup uses bind:0-1. When
> resources are insufficient, it will be changed to bind:3, bind:0-2 by
> the control plane, etc. Furthermore, more mempolicies can be extended, such as
> allocating memory according to node weight, etc.

Yes, I do understand that you want to change the node affinity and that
is already possible with cpuset cgroup. The existing constrain is that
the policy is hardcoded mbind IIRC. So you cannot really implement a dynamic
preferred policy which would make some sense to me. The question is how
to implement that with a sensible semantic. It is hard to partition the
system into several cgroups if subset allows to spill over to others.
Say something like the following
	root (nodes=0-3)
       /    \
A (0, 1)     B (2, 3)

if both are MBIND then this makes sense because they are kinda isolated
(at least for user allocations) but if B is PREFERRED and therefore
allowed to use nodes 0 and 1 then it can deplete the memory from A and
therefore isolation doesn't work at all.

I can imagine that the all cgroups would use PREFERRED policy and then
nobody can expect anything and the configuration is mostly best effort.
But it feels like this is an abuse of the cgroup interface and a proper
syscall interface is likely due. Would it make more sense to add
pidfd_set_mempolicy and allow sufficiently privileged process to
manipulate default memory policy of a remote process?
Zhongkun He Sept. 7, 2022, 1:50 p.m. UTC | #10
>> Hi Michal, thanks for your reply.
>>
>>>> Say parent has a stronger requirement (say bind) than a child(prefer)?
>>
>> Yes, combine all these together.
> 
> What is the semantic of the resulting policy?
> 
>> The parent's task will use 'bind', the child's
>> use 'prefer'. This is the current implementation, and we can discuss and
>> modify it together if there are other suggestions.
>>
>> 1:Existing shortcomings
>>
>> In our use case, the application and the control plane are two separate
>> systems. When the application is created, it doesn't know how to use memory,
>> and it doesn't care. The control plane will decide the memory usage policy
>> based on different reasons (the attributes of the application itself, the
>> priority, the remaining resources of the system). Currently, numactl is used
>> to set it at program startup, and the child process will inherit the
>> mempolicy.
> 
> Yes this is common practice I have seen so far.
> 
>> But we can't dynamically adjust the memory policy, except
>> restart, the memory policy will not change.
> 
> Do you really need to change the policy itself or only the effective
> nodemask? I mean what is your usecase to go from say mbind to preferred
> policy?  Do you need any other policy than bind and preferred?
>   
>> 2:Our goals
>>
>> For the above reasons, we want to create a mempolicy at the cgroup level.
>> Usually processes under a cgroup have the same priority and attributes, and
>> we can dynamically adjust the memory allocation strategy according to the
>> remaining resources of the system. For example, a low-priority cgroup uses
>> the 'bind:2-3' policy, and a high-priority cgroup uses bind:0-1. When
>> resources are insufficient, it will be changed to bind:3, bind:0-2 by
>> the control plane, etc. Furthermore, more mempolicies can be extended, such as
>> allocating memory according to node weight, etc.
> 
> Yes, I do understand that you want to change the node affinity and that
> is already possible with cpuset cgroup. The existing constrain is that
> the policy is hardcoded mbind IIRC. So you cannot really implement a dynamic
> preferred policy which would make some sense to me. The question is how
> to implement that with a sensible semantic. It is hard to partition the
> system into several cgroups if subset allows to spill over to others.
> Say something like the following
> 	root (nodes=0-3)
>         /    \
> A (0, 1)     B (2, 3)
> 
> if both are MBIND then this makes sense because they are kinda isolated
> (at least for user allocations) but if B is PREFERRED and therefore
> allowed to use nodes 0 and 1 then it can deplete the memory from A and
> therefore isolation doesn't work at all.
> 
> I can imagine that the all cgroups would use PREFERRED policy and then
> nobody can expect anything and the configuration is mostly best effort.
> But it feels like this is an abuse of the cgroup interface and a proper
> syscall interface is likely due. Would it make more sense to add
> pidfd_set_mempolicy and allow sufficiently privileged process to
> manipulate default memory policy of a remote process?

Hi Michal, thanks for your reply.

 > Do you really need to change the policy itself or only the effective
 > nodemask? Do you need any other policy than bind and preferred?

Yes, we need to change the policy, not only its nodemask. We really want
the policy to be interleave, and to extend it to weight-interleave.
Say something like the following
			node       weight
     interleave:		 0-3       1:1:1:1  default one by one
     weight-interleave:   0-3       1:2:4:6  alloc pages by weight
					    (User set weight.)
In the actual usecase, the remaining resources of each node are 
different, and the use of interleave cannot maximize the use of resources.

Back to the previous question.
 >The question is how to implement that with a sensible semantic.

Thanks for your analysis and suggestions. It is really difficult to add
a policy directly to cgroup due to the hierarchical enforcement. It would be
a good idea to add pidfd_set_mempolicy.

Also, there is a new idea.
We can try to separate the elements of mempolicy and use them independently.
Mempolicy has two meanings:
     nodes: which nodes to use (nodes, 0-3); we can use cpuset's
effective_mems directly.
     mode: how to use them (bind, prefer, etc.); change the mode to a
cpuset->flags bit, such as CS_INTERLEAVE.
task_struct->mems_allowed is equal to cpuset->effective_mems, which is
hierarchically enforced. CS_INTERLEAVE can also be propagated into tasks,
just like other flags (CS_SPREAD_PAGE).
When a process needs to allocate memory, it can find the appropriate 
node to allocate pages according to the flag and mems_allowed.

thanks.
Michal Hocko Sept. 8, 2022, 7:19 a.m. UTC | #11
On Wed 07-09-22 21:50:24, Zhongkun He wrote:
[...]
> > Do you really need to change the policy itself or only the effective
> > nodemask? Do you need any other policy than bind and preferred?
> 
> Yes, we need to change the policy, not only his nodemask. we really want
> policy is interleave, and extend it to weight-interleave.
> Say something like the following
> 			node       weight
>     interleave:		 0-3       1:1:1:1  default one by one
>     weight-interleave:   0-3       1:2:4:6  alloc pages by weight
> 					    (User set weight.)
> In the actual usecase, the remaining resources of each node are different,
> and the use of interleave cannot maximize the use of resources.

OK, this seems a separate topic. It would be good to start by proposing
that new policy in isolation with the semantic description.

> Back to the previous question.
> >The question is how to implement that with a sensible semantic.
> 
> Thanks for your analysis and suggestions.It is really difficult to add
> policy directly to cgroup for the hierarchical enforcement. It would be a
> good idea to add pidfd_set_mempolicy.

Are you going to pursue that path?
 
> Also, there is a new idea.
> We can try to separate the elements of mempolicy and use them independently.
> Mempolicy has two meanings:
>     nodes:which nodes to use(nodes,0-3), we can use cpuset's effective_mems
> directly.
>     mode:how to use them(bind,prefer,etc). change the mode to a
> cpuset->flags,such as CS_INTERLEAVE。
> task_struct->mems_allowed is equal to cpuset->effective_mems,which is
> hierarchical enforcement。CS_INTERLEAVE can also be updated into tasks,
> just like other flags(CS_SPREAD_PAGE).
> When a process needs to allocate memory, it can find the appropriate node to
> allocate pages according to the flag and mems_allowed.

I am not sure I see the advantage as the mode and nodes are always
closely coupled. You cannot really have one without the other.
Zhongkun He Sept. 9, 2022, 2:55 a.m. UTC | #12
> On Wed 07-09-22 21:50:24, Zhongkun He wrote:
> [...]
>>> Do you really need to change the policy itself or only the effective
>>> nodemask? Do you need any other policy than bind and preferred?
>>
>> Yes, we need to change the policy, not only his nodemask. we really want
>> policy is interleave, and extend it to weight-interleave.
>> Say something like the following
>> 			node       weight
>>      interleave:		 0-3       1:1:1:1  default one by one
>>      weight-interleave:   0-3       1:2:4:6  alloc pages by weight
>> 					    (User set weight.)
>> In the actual usecase, the remaining resources of each node are different,
>> and the use of interleave cannot maximize the use of resources.
> 
> OK, this seems a separate topic. It would be good to start by proposing
> that new policy in isolation with the semantic description.
> 
>> Back to the previous question.
>>> The question is how to implement that with a sensible semantic.
>>
>> Thanks for your analysis and suggestions.It is really difficult to add
>> policy directly to cgroup for the hierarchical enforcement. It would be a
>> good idea to add pidfd_set_mempolicy.
> 
> Are you going to pursue that path?
>   
>> Also, there is a new idea.
>> We can try to separate the elements of mempolicy and use them independently.
>> Mempolicy has two meanings:
>>      nodes:which nodes to use(nodes,0-3), we can use cpuset's effective_mems
>> directly.
>>      mode:how to use them(bind,prefer,etc). change the mode to a
>> cpuset->flags,such as CS_INTERLEAVE。
>> task_struct->mems_allowed is equal to cpuset->effective_mems,which is
>> hierarchical enforcement。CS_INTERLEAVE can also be updated into tasks,
>> just like other flags(CS_SPREAD_PAGE).
>> When a process needs to allocate memory, it can find the appropriate node to
>> allocate pages according to the flag and mems_allowed.
> 
> I am not sure I see the advantage as the mode and nodes are always
> closely coupled. You cannot really have one without the other.
> 

Hi Michal, thanks for your suggestion and reply.

 > Are you going to pursue that path?

Yes, I'll give it a try as it makes sense to modify the policy dynamically.

Thanks.
Zhongkun He Sept. 14, 2022, 3:10 p.m. UTC | #13
>>
>>> Back to the previous question.
>>>> The question is how to implement that with a sensible semantic.
>>>
>>> Thanks for your analysis and suggestions.It is really difficult to add
>>> policy directly to cgroup for the hierarchical enforcement. It would 
>>> be a good idea to add pidfd_set_mempolicy.
>>
>> Are you going to pursue that path?

> Hi Michal, thanks for your suggestion and reply.
> 
>  > Are you going to pursue that path?
> 
> Yes, I'll give it a try as it makes sense to modify the policy dynamically.
> 
> Thanks.

Hi Michal, I have a question about pidfd_set_mempolicy; it would be 
better if you have some suggestions.

The task_struct of processes and threads are independent. If we change 
the mempolicy of the process through pidfd_set_mempolicy, the mempolicy 
of its thread will not change. Of course users can set the mempolicy of 
all threads by iterating through /proc/tgid/task.

The question is whether we should override the thread's mempolicy when 
setting the process's mempolicy.

There are two options:
A:Change the process's mempolicy and set that mempolicy to all it's threads.
B:Only change the process's mempolicy in kernel. The mempolicy of the 
thread needs to be modified by the user through pidfd_set_mempolicy in
userspace, if necessary.

Thanks.
Michal Hocko Sept. 23, 2022, 7:29 a.m. UTC | #14
On Wed 14-09-22 23:10:47, Zhongkun He wrote:
> > > 
> > > > Back to the previous question.
> > > > > The question is how to implement that with a sensible semantic.
> > > > 
> > > > Thanks for your analysis and suggestions.It is really difficult to add
> > > > policy directly to cgroup for the hierarchical enforcement. It
> > > > would be a good idea to add pidfd_set_mempolicy.
> > > 
> > > Are you going to pursue that path?
> 
> > Hi Michal, thanks for your suggestion and reply.
> > 
> >  > Are you going to pursue that path?
> > 
> > Yes, I'll give it a try as it makes sense to modify the policy dynamically.
> > 
> > Thanks.
> 
> Hi Michal, i have a question about pidfd_set_mempolicy, it would be better
> if you have some suggestions.
> 
> The task_struct of processes and threads are independent. If we change the
> mempolicy of the process through pidfd_set_mempolicy, the mempolicy of its
> thread will not change. Of course users can set the mempolicy of all threads
> by iterating through /proc/tgid/task.
> 
> The question is whether we should override the thread's mempolicy when
> setting the process's mempolicy.
> 
> There are two options:
> A:Change the process's mempolicy and set that mempolicy to all it's threads.
> B:Only change the process's mempolicy in kernel. The mempolicy of the thread
> needs to be modified by the user through pidfd_set_mempolicy in
> userspace, if necessary.

set_mempolicy is a per task_struct operation and so should be pidfd
based API as well. If somebody requires a per-thread-group setting then
the whole group should be iterated. I do not think we have any
precedent where a pidfd operation on the thread group leader has side
effects on other threads as well.
Zhongkun He Sept. 23, 2022, 3:26 p.m. UTC | #15
> 
> set_mempolicy is a per task_struct operation and so should be pidfd
> based API as well. If somebody requires a per-thread-group setting then
> the whole group should be iterated. I do not think we have any
> precendence where pidfd operation on the thread group leader has side
> effects on other threads as well.

Hi Michal,

I got it, thanks for your suggestions and reply.
diff mbox series

Patch

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 668389b4b53d..41c4b28fb71f 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -142,6 +142,7 @@  extern void numa_default_policy(void);
 extern void numa_policy_init(void);
 extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask);
 
 extern int huge_node(struct vm_area_struct *vma,
 				unsigned long addr, gfp_t gfp_flags,
@@ -211,6 +212,11 @@  static inline void mpol_get(struct mempolicy *pol)
 {
 }
 
+static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
+{
+	return NULL;
+}
+
 struct shared_policy {};
 
 static inline void mpol_shared_policy_init(struct shared_policy *sp,
@@ -252,6 +258,9 @@  static inline void mpol_rebind_task(struct task_struct *tsk,
 static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 }
+static inline void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
+{
+}
 
 static inline int huge_node(struct vm_area_struct *vma,
 				unsigned long addr, gfp_t gfp_flags,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e7b2f8a5c711..f935e1707e7c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1236,6 +1236,8 @@  struct task_struct {
 	unsigned long			preempt_disable_ip;
 #endif
 #ifdef CONFIG_NUMA
+	/* cpuset memory policy */
+	struct mempolicy		*cs_mpol;
 	/* Protected by alloc_lock: */
 	struct mempolicy		*mempolicy;
 	short				il_prev;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 1f3a55297f39..19fd359dc8d8 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -118,6 +118,9 @@  struct cpuset {
 	cpumask_var_t effective_cpus;
 	nodemask_t effective_mems;
 
+	/* cpuset memory policy */
+	struct mempolicy *mempolicy;
+
 	/*
 	 * CPUs allocated to child sub-partitions (default hierarchy only)
 	 * - CPUs granted by the parent = effective_cpus U subparts_cpus
@@ -378,6 +381,10 @@  static void cpuset_hotplug_workfn(struct work_struct *work);
 static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
 
 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
+static void cpuset_change_task_cs_mpol(struct task_struct *tsk,
+		struct mempolicy *mpol);
+static inline void update_cs_mpol_nodemask(struct cpuset *cs,
+		struct mempolicy *mpol);
 
 static inline void check_insane_mems_config(nodemask_t *nodes)
 {
@@ -570,7 +577,10 @@  static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
 	if (!trial)
 		return NULL;
 
+	mpol_get(trial->mempolicy);
+
 	if (alloc_cpumasks(trial, NULL)) {
+		mpol_put(trial->mempolicy);
 		kfree(trial);
 		return NULL;
 	}
@@ -587,6 +597,7 @@  static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
 static inline void free_cpuset(struct cpuset *cs)
 {
 	free_cpumasks(cs, NULL);
+	mpol_put(cs->mempolicy);
 	kfree(cs);
 }
 
@@ -1849,6 +1860,7 @@  static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 
 		spin_lock_irq(&callback_lock);
 		cp->effective_mems = *new_mems;
+		update_cs_mpol_nodemask(cp, cp->mempolicy);
 		spin_unlock_irq(&callback_lock);
 
 		WARN_ON(!is_in_v2_mode() &&
@@ -2304,7 +2316,8 @@  static void cpuset_attach(struct cgroup_taskset *tset)
 		 * fail.  TODO: have a better way to handle failure here
 		 */
 		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
-
+		/* Propagate the cpuset mempolicy to the task. */
+		cpuset_change_task_cs_mpol(task, cs->mempolicy);
 		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
 		cpuset_update_task_spread_flag(cs, task);
 	}
@@ -2441,6 +2454,140 @@  static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	return retval;
 }
 
+#ifdef CONFIG_NUMA
+
+/* Update the mpol nodemask based on the cpuset nodemask. */
+static inline void update_cs_mpol_nodemask(struct cpuset *cs, struct mempolicy *mpol)
+{
+	nodemask_t newmems;
+
+	if (!mpol)
+		return;
+	nodes_and(newmems, cs->effective_mems, mpol->w.user_nodemask);
+
+	/* If there is no intersection between effective_mems and the mpol
+	 * user_nodemask, use effective_mems to allocate memory.
+	 */
+	if (!nodes_weight(newmems))
+		newmems = cs->effective_mems;
+	mpol_rebind_policy(mpol, &newmems);
+}
+
+/* Update the cpuset mempolicy for a task. */
+static void cpuset_change_task_cs_mpol(struct task_struct *tsk,
+		struct mempolicy *mpol)
+{
+	struct mempolicy *old = NULL;
+
+	task_lock(tsk);
+	local_irq_disable();
+	write_seqcount_begin(&tsk->mems_allowed_seq);
+
+	old = tsk->cs_mpol;
+	tsk->cs_mpol = mpol;
+	mpol_get(mpol);
+	tsk->il_prev = 0;
+
+	write_seqcount_end(&tsk->mems_allowed_seq);
+	local_irq_enable();
+	task_unlock(tsk);
+	mpol_put(old);
+}
+
+static void update_tasks_cs_mpol(struct cpuset *cs)
+{
+	struct css_task_iter it;
+	struct task_struct *task;
+
+	css_task_iter_start(&cs->css, 0, &it);
+
+	while ((task = css_task_iter_next(&it)))
+		cpuset_change_task_cs_mpol(task, cs->mempolicy);
+	css_task_iter_end(&it);
+}
+
+/* change cpuset mempolicy */
+static ssize_t cpuset_mpol_write(struct kernfs_open_file *of,
+		char *buf, size_t nbytes, loff_t off)
+{
+	struct mempolicy *mpol, *old = NULL;
+	struct cpuset *cs = css_cs(of_css(of));
+	nodemask_t cs_allowed;
+	int err = -ENODEV;
+
+	css_get(&cs->css);
+	kernfs_break_active_protection(of->kn);
+	percpu_down_write(&cpuset_rwsem);
+
+	if (!is_cpuset_online(cs))
+		goto out_unlock;
+
+	buf = strstrip(buf);
+	err = mpol_parse_str(buf, &mpol);
+
+	if (err) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	spin_lock_irq(&callback_lock);
+	old = cs->mempolicy;
+	update_cs_mpol_nodemask(cs, mpol);
+	cs->mempolicy = mpol;
+	spin_unlock_irq(&callback_lock);
+
+	update_tasks_cs_mpol(cs);
+
+out_unlock:
+	percpu_up_write(&cpuset_rwsem);
+	kernfs_unbreak_active_protection(of->kn);
+	css_put(&cs->css);
+
+	if (old) {
+		/* Wait for outstanding RCU read-side critical sections to complete. */
+		synchronize_rcu();
+		mpol_put(old);
+	}
+	return err ?: nbytes;
+}
+
+/* Show the cpuset mempolicy. */
+static int cpuset_mpol_show(struct seq_file *seq, void *v)
+{
+	char buffer[64];
+	int ret = 0;
+	struct mempolicy *mpol;
+	struct cpuset *cs = css_cs(seq_css(seq));
+
+	memset(buffer, 0, sizeof(buffer));
+	spin_lock_irq(&callback_lock);
+	mpol = cs->mempolicy;
+
+	if (!mpol || mpol->mode == MPOL_DEFAULT)
+		goto out_unlock;
+
+	mpol_to_str(buffer, sizeof(buffer), mpol);
+	seq_printf(seq, buffer);
+	seq_putc(seq, '\n');
+
+out_unlock:
+	spin_unlock_irq(&callback_lock);
+	return ret;
+}
+
+#else
+static void cpuset_change_task_cs_mpol(struct task_struct *tsk,
+		struct mempolicy *mpol)
+{
+}
+
+static inline void update_cs_mpol_nodemask(struct cpuset *cs,
+		struct mempolicy *mpol)
+{
+}
+
+#endif
+
 /*
  * Common handling for a write to a "cpus" or "mems" file.
  */
@@ -2678,7 +2825,14 @@  static struct cftype legacy_files[] = {
 		.seq_show = cpuset_common_seq_show,
 		.private = FILE_EFFECTIVE_MEMLIST,
 	},
-
+#ifdef CONFIG_NUMA
+	{
+		.name = "mems_policy",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpuset_mpol_show,
+		.write = cpuset_mpol_write,
+	},
+#endif
 	{
 		.name = "cpu_exclusive",
 		.read_u64 = cpuset_read_u64,
@@ -2786,7 +2940,14 @@  static struct cftype dfl_files[] = {
 		.seq_show = cpuset_common_seq_show,
 		.private = FILE_EFFECTIVE_MEMLIST,
 	},
-
+#ifdef CONFIG_NUMA
+	{
+		.name = "mems.policy",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpuset_mpol_show,
+		.write = cpuset_mpol_write,
+	},
+#endif
 	{
 		.name = "cpus.partition",
 		.seq_show = sched_partition_show,
@@ -2835,6 +2996,21 @@  cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
 
+	if (IS_ENABLED(CONFIG_NUMA)) {
+		struct cpuset *pcs = css_cs(parent_css);
+		struct mempolicy *new;
+
+		/* Inherit the mempolicy from the parent. */
+		spin_lock_irq(&callback_lock);
+		new = mpol_dup(pcs->mempolicy);
+
+		if (IS_ERR(new))
+			new = NULL;
+
+		cs->mempolicy = new;
+		spin_unlock_irq(&callback_lock);
+	}
+
 	/* Set CS_MEMORY_MIGRATE for default hierarchy */
 	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
 		__set_bit(CS_MEMORY_MIGRATE, &cs->flags);
diff --git a/kernel/fork.c b/kernel/fork.c
index 90c85b17bf69..3f695449e2a5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2190,6 +2190,7 @@  static __latent_entropy struct task_struct *copy_process(
 		p->mempolicy = NULL;
 		goto bad_fork_cleanup_delayacct;
 	}
+	mpol_get(p->cs_mpol); /* take a reference on the cpuset mempolicy */
 #endif
 #ifdef CONFIG_CPUSETS
 	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b73d3248d976..144f79887df9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -158,9 +158,15 @@  int numa_map_to_online_node(int node)
 }
 EXPORT_SYMBOL_GPL(numa_map_to_online_node);
 
+static inline struct mempolicy *task_or_cs_mpol(struct task_struct *p)
+{
+	return p->mempolicy ?
+		p->mempolicy : p->cs_mpol;
+}
+
 struct mempolicy *get_task_policy(struct task_struct *p)
 {
-	struct mempolicy *pol = p->mempolicy;
+	struct mempolicy *pol = task_or_cs_mpol(p);
 	int node;
 
 	if (pol)
@@ -349,7 +355,7 @@  static void mpol_rebind_preferred(struct mempolicy *pol,
  * policies are protected by task->mems_allowed_seq to prevent a premature
  * OOM/allocation failure due to parallel nodemask modification.
  */
-static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
+void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 {
 	if (!pol || pol->mode == MPOL_LOCAL)
 		return;
@@ -1895,7 +1901,7 @@  unsigned int mempolicy_slab_node(void)
 	if (!in_task())
 		return node;
 
-	policy = current->mempolicy;
+	policy = task_or_cs_mpol(current);
 	if (!policy)
 		return node;
 
@@ -2043,7 +2049,7 @@  bool init_nodemask_of_mempolicy(nodemask_t *mask)
 		return false;
 
 	task_lock(current);
-	mempolicy = current->mempolicy;
+	mempolicy = task_or_cs_mpol(current);
 	switch (mempolicy->mode) {
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
@@ -2633,13 +2639,16 @@  int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
  */
 void mpol_put_task_policy(struct task_struct *task)
 {
-	struct mempolicy *pol;
+	struct mempolicy *pol, *cs_pol;
 
 	task_lock(task);
 	pol = task->mempolicy;
+	cs_pol = task->cs_mpol;
+	task->cs_mpol = NULL;
 	task->mempolicy = NULL;
 	task_unlock(task);
 	mpol_put(pol);
+	mpol_put(cs_pol);
 }
 
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
@@ -2957,8 +2966,7 @@  static const char * const policy_modes[] =
 	[MPOL_PREFERRED_MANY]  = "prefer (many)",
 };
 
-
-#ifdef CONFIG_TMPFS
+#if defined(CONFIG_TMPFS) || defined(CONFIG_NUMA)
 /**
  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
  * @str:  string containing mempolicy to parse
@@ -3054,6 +3062,9 @@  int mpol_parse_str(char *str, struct mempolicy **mpol)
 			mode_flags |= MPOL_F_RELATIVE_NODES;
 		else
 			goto out;
+	} else {
+		/* use static mode flags by default */
+		mode_flags |= MPOL_F_STATIC_NODES;
 	}
 
 	new = mpol_new(mode, mode_flags, &nodes);