Message ID | 20220904040241.1708-1-hezhongkun.hzk@bytedance.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | cgroup/cpuset: Add a new isolated mems.policy type. | expand |
Hi hezhongkun, Thank you for the patch! Perhaps something to improve: [auto build test WARNING on c40e8341e3b3bb27e3a65b06b5b454626234c4f0] url: https://github.com/intel-lab-lkp/linux/commits/hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436 base: c40e8341e3b3bb27e3a65b06b5b454626234c4f0 config: x86_64-randconfig-a014 compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1) reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/intel-lab-lkp/linux/commit/aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47 git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436 git checkout aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47 # save the config file mkdir build_dir && cp config build_dir/.config COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash kernel/cgroup/ If you fix the issue, kindly add following tag where applicable Reported-by: kernel test robot <lkp@intel.com> All warnings (new ones prefixed by >>): >> kernel/cgroup/cpuset.c:2515:13: warning: unused variable 'cs_allowed' [-Wunused-variable] nodemask_t cs_allowed; ^ 1 warning generated. 
vim +/cs_allowed +2515 kernel/cgroup/cpuset.c 2508 2509 /* change cpuset mempolicy */ 2510 static ssize_t cpuset_mpol_write(struct kernfs_open_file *of, 2511 char *buf, size_t nbytes, loff_t off) 2512 { 2513 struct mempolicy *mpol, *old = NULL; 2514 struct cpuset *cs = css_cs(of_css(of)); > 2515 nodemask_t cs_allowed; 2516 int err = -ENODEV; 2517 2518 css_get(&cs->css); 2519 kernfs_break_active_protection(of->kn); 2520 percpu_down_write(&cpuset_rwsem); 2521 2522 if (!is_cpuset_online(cs)) 2523 goto out_unlock; 2524 2525 buf = strstrip(buf); 2526 err = mpol_parse_str(buf, &mpol); 2527 2528 if (err) { 2529 err = -EINVAL; 2530 goto out_unlock; 2531 } 2532 2533 spin_lock_irq(&callback_lock); 2534 old = cs->mempolicy; 2535 update_cs_mpol_nodemask(cs, mpol); 2536 cs->mempolicy = mpol; 2537 spin_unlock_irq(&callback_lock); 2538 2539 update_tasks_cs_mpol(cs); 2540 2541 out_unlock: 2542 percpu_up_write(&cpuset_rwsem); 2543 kernfs_unbreak_active_protection(of->kn); 2544 css_put(&cs->css); 2545 2546 if (old) { 2547 /*Wait for outstanding programs to complete.*/ 2548 synchronize_rcu(); 2549 mpol_put(old); 2550 } 2551 return err ?: nbytes; 2552 } 2553
Hi hezhongkun, Thank you for the patch! Perhaps something to improve: [auto build test WARNING on c40e8341e3b3bb27e3a65b06b5b454626234c4f0] url: https://github.com/intel-lab-lkp/linux/commits/hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436 base: c40e8341e3b3bb27e3a65b06b5b454626234c4f0 config: x86_64-defconfig compiler: gcc-11 (Debian 11.3.0-5) 11.3.0 reproduce (this is a W=1 build): # https://github.com/intel-lab-lkp/linux/commit/aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47 git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436 git checkout aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47 # save the config file mkdir build_dir && cp config build_dir/.config make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash kernel/cgroup/ If you fix the issue, kindly add following tag where applicable Reported-by: kernel test robot <lkp@intel.com> All warnings (new ones prefixed by >>): kernel/cgroup/cpuset.c: In function 'cpuset_mpol_write': >> kernel/cgroup/cpuset.c:2515:20: warning: unused variable 'cs_allowed' [-Wunused-variable] 2515 | nodemask_t cs_allowed; | ^~~~~~~~~~ vim +/cs_allowed +2515 kernel/cgroup/cpuset.c 2508 2509 /* change cpuset mempolicy */ 2510 static ssize_t cpuset_mpol_write(struct kernfs_open_file *of, 2511 char *buf, size_t nbytes, loff_t off) 2512 { 2513 struct mempolicy *mpol, *old = NULL; 2514 struct cpuset *cs = css_cs(of_css(of)); > 2515 nodemask_t cs_allowed; 2516 int err = -ENODEV; 2517 2518 css_get(&cs->css); 2519 kernfs_break_active_protection(of->kn); 2520 percpu_down_write(&cpuset_rwsem); 2521 2522 if (!is_cpuset_online(cs)) 2523 goto out_unlock; 2524 2525 buf = strstrip(buf); 2526 err = mpol_parse_str(buf, &mpol); 2527 2528 if (err) { 2529 err = -EINVAL; 2530 goto out_unlock; 2531 } 2532 2533 spin_lock_irq(&callback_lock); 2534 old = cs->mempolicy; 2535 update_cs_mpol_nodemask(cs, mpol); 2536 cs->mempolicy = 
mpol; 2537 spin_unlock_irq(&callback_lock); 2538 2539 update_tasks_cs_mpol(cs); 2540 2541 out_unlock: 2542 percpu_up_write(&cpuset_rwsem); 2543 kernfs_unbreak_active_protection(of->kn); 2544 css_put(&cs->css); 2545 2546 if (old) { 2547 /*Wait for outstanding programs to complete.*/ 2548 synchronize_rcu(); 2549 mpol_put(old); 2550 } 2551 return err ?: nbytes; 2552 } 2553
Hi hezhongkun,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on c40e8341e3b3bb27e3a65b06b5b454626234c4f0]
url: https://github.com/intel-lab-lkp/linux/commits/hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436
base: c40e8341e3b3bb27e3a65b06b5b454626234c4f0
config: s390-randconfig-r044-20220904
compiler: s390-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436
git checkout aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>
All warnings (new ones prefixed by >>):
>> mm/mempolicy.c:2980:5: warning: no previous prototype for 'mpol_parse_str' [-Wmissing-prototypes]
2980 | int mpol_parse_str(char *str, struct mempolicy **mpol)
| ^~~~~~~~~~~~~~
vim +/mpol_parse_str +2980 mm/mempolicy.c
1a75a6c825c172 Christoph Lameter 2006-01-08 2968
aafa4e2c9e5c3c Zhongkun He 2022-09-04 2969 #if defined(CONFIG_TMPFS) || defined(CONFIG_NUMA)
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2970 /**
f2a07f40dbc603 Hugh Dickins 2013-01-02 2971 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2972 * @str: string containing mempolicy to parse
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 2973 * @mpol: pointer to struct mempolicy pointer, returned on success.
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2974 *
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2975 * Format of input:
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2976 * <mode>[=<flags>][:<nodelist>]
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2977 *
dad5b023294981 Randy Dunlap 2022-01-14 2978 * Return: %0 on success, else %1
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2979 */
a7a88b23737095 Hugh Dickins 2013-01-02 @2980 int mpol_parse_str(char *str, struct mempolicy **mpol)
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2981 {
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 2982 struct mempolicy *new = NULL;
f2a07f40dbc603 Hugh Dickins 2013-01-02 2983 unsigned short mode_flags;
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 2984 nodemask_t nodes;
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2985 char *nodelist = strchr(str, ':');
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2986 char *flags = strchr(str, '=');
dedf2c73b80b45 zhong jiang 2018-10-26 2987 int err = 1, mode;
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2988
c7a91bc7c2e17e Dan Carpenter 2020-01-30 2989 if (flags)
c7a91bc7c2e17e Dan Carpenter 2020-01-30 2990 *flags++ = '\0'; /* terminate mode string */
c7a91bc7c2e17e Dan Carpenter 2020-01-30 2991
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2992 if (nodelist) {
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2993 /* NUL-terminate mode or flags string */
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2994 *nodelist++ = '\0';
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 2995 if (nodelist_parse(nodelist, nodes))
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2996 goto out;
01f13bd607346a Lai Jiangshan 2012-12-12 2997 if (!nodes_subset(nodes, node_states[N_MEMORY]))
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 2998 goto out;
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 2999 } else
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3000 nodes_clear(nodes);
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3001
dedf2c73b80b45 zhong jiang 2018-10-26 3002 mode = match_string(policy_modes, MPOL_MAX, str);
dedf2c73b80b45 zhong jiang 2018-10-26 3003 if (mode < 0)
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3004 goto out;
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3005
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3006 switch (mode) {
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3007 case MPOL_PREFERRED:
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3008 /*
aa9f7d5172fac9 Randy Dunlap 2020-04-01 3009 * Insist on a nodelist of one node only, although later
aa9f7d5172fac9 Randy Dunlap 2020-04-01 3010 * we use first_node(nodes) to grab a single node, so here
aa9f7d5172fac9 Randy Dunlap 2020-04-01 3011 * nodelist (or nodes) cannot be empty.
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3012 */
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3013 if (nodelist) {
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3014 char *rest = nodelist;
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3015 while (isdigit(*rest))
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3016 rest++;
926f2ae04f1830 KOSAKI Motohiro 2010-03-23 3017 if (*rest)
926f2ae04f1830 KOSAKI Motohiro 2010-03-23 3018 goto out;
aa9f7d5172fac9 Randy Dunlap 2020-04-01 3019 if (nodes_empty(nodes))
aa9f7d5172fac9 Randy Dunlap 2020-04-01 3020 goto out;
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3021 }
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3022 break;
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3023 case MPOL_INTERLEAVE:
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3024 /*
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3025 * Default to online nodes with memory if no nodelist
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3026 */
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3027 if (!nodelist)
01f13bd607346a Lai Jiangshan 2012-12-12 3028 nodes = node_states[N_MEMORY];
3f226aa1cbc006 Lee Schermerhorn 2008-04-28 3029 break;
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3030 case MPOL_LOCAL:
3f226aa1cbc006 Lee Schermerhorn 2008-04-28 3031 /*
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3032 * Don't allow a nodelist; mpol_new() checks flags
3f226aa1cbc006 Lee Schermerhorn 2008-04-28 3033 */
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3034 if (nodelist)
3f226aa1cbc006 Lee Schermerhorn 2008-04-28 3035 goto out;
3f226aa1cbc006 Lee Schermerhorn 2008-04-28 3036 break;
413b43deab8377 Ravikiran G Thirumalai 2010-03-23 3037 case MPOL_DEFAULT:
413b43deab8377 Ravikiran G Thirumalai 2010-03-23 3038 /*
413b43deab8377 Ravikiran G Thirumalai 2010-03-23 3039 * Insist on a empty nodelist
413b43deab8377 Ravikiran G Thirumalai 2010-03-23 3040 */
413b43deab8377 Ravikiran G Thirumalai 2010-03-23 3041 if (!nodelist)
413b43deab8377 Ravikiran G Thirumalai 2010-03-23 3042 err = 0;
413b43deab8377 Ravikiran G Thirumalai 2010-03-23 3043 goto out;
b27abaccf8e8b0 Dave Hansen 2021-09-02 3044 case MPOL_PREFERRED_MANY:
d69b2e63e9172a KOSAKI Motohiro 2010-03-23 3045 case MPOL_BIND:
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3046 /*
d69b2e63e9172a KOSAKI Motohiro 2010-03-23 3047 * Insist on a nodelist
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3048 */
d69b2e63e9172a KOSAKI Motohiro 2010-03-23 3049 if (!nodelist)
d69b2e63e9172a KOSAKI Motohiro 2010-03-23 3050 goto out;
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3051 }
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3052
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3053 mode_flags = 0;
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3054 if (flags) {
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3055 /*
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3056 * Currently, we only support two mutually exclusive
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3057 * mode flags.
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3058 */
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3059 if (!strcmp(flags, "static"))
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3060 mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3061 else if (!strcmp(flags, "relative"))
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3062 mode_flags |= MPOL_F_RELATIVE_NODES;
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3063 else
926f2ae04f1830 KOSAKI Motohiro 2010-03-23 3064 goto out;
aafa4e2c9e5c3c Zhongkun He 2022-09-04 3065 } else {
aafa4e2c9e5c3c Zhongkun He 2022-09-04 3066 /*use static mode_flags in default*/
aafa4e2c9e5c3c Zhongkun He 2022-09-04 3067 mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3068 }
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3069
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3070 new = mpol_new(mode, mode_flags, &nodes);
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3071 if (IS_ERR(new))
926f2ae04f1830 KOSAKI Motohiro 2010-03-23 3072 goto out;
926f2ae04f1830 KOSAKI Motohiro 2010-03-23 3073
f2a07f40dbc603 Hugh Dickins 2013-01-02 3074 /*
f2a07f40dbc603 Hugh Dickins 2013-01-02 3075 * Save nodes for mpol_to_str() to show the tmpfs mount options
f2a07f40dbc603 Hugh Dickins 2013-01-02 3076 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
f2a07f40dbc603 Hugh Dickins 2013-01-02 3077 */
269fbe72cded0a Ben Widawsky 2021-06-30 3078 if (mode != MPOL_PREFERRED) {
269fbe72cded0a Ben Widawsky 2021-06-30 3079 new->nodes = nodes;
269fbe72cded0a Ben Widawsky 2021-06-30 3080 } else if (nodelist) {
269fbe72cded0a Ben Widawsky 2021-06-30 3081 nodes_clear(new->nodes);
269fbe72cded0a Ben Widawsky 2021-06-30 3082 node_set(first_node(nodes), new->nodes);
269fbe72cded0a Ben Widawsky 2021-06-30 3083 } else {
7858d7bca7fbbb Feng Tang 2021-06-30 3084 new->mode = MPOL_LOCAL;
269fbe72cded0a Ben Widawsky 2021-06-30 3085 }
f2a07f40dbc603 Hugh Dickins 2013-01-02 3086
f2a07f40dbc603 Hugh Dickins 2013-01-02 3087 /*
f2a07f40dbc603 Hugh Dickins 2013-01-02 3088 * Save nodes for contextualization: this will be used to "clone"
f2a07f40dbc603 Hugh Dickins 2013-01-02 3089 * the mempolicy in a specific context [cpuset] at a later time.
f2a07f40dbc603 Hugh Dickins 2013-01-02 3090 */
e17f74af351cce Lee Schermerhorn 2010-05-24 3091 new->w.user_nodemask = nodes;
f2a07f40dbc603 Hugh Dickins 2013-01-02 3092
926f2ae04f1830 KOSAKI Motohiro 2010-03-23 3093 err = 0;
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3094
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3095 out:
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3096 /* Restore string for error message */
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3097 if (nodelist)
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3098 *--nodelist = ':';
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3099 if (flags)
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3100 *--flags = '=';
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3101 if (!err)
71fe804b6d56d6 Lee Schermerhorn 2008-04-28 3102 *mpol = new;
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3103 return err;
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3104 }
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3105 #endif /* CONFIG_TMPFS */
095f1fc4ebf36c Lee Schermerhorn 2008-04-28 3106
Hi hezhongkun, Thank you for the patch! Yet something to improve: [auto build test ERROR on c40e8341e3b3bb27e3a65b06b5b454626234c4f0] url: https://github.com/intel-lab-lkp/linux/commits/hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436 base: c40e8341e3b3bb27e3a65b06b5b454626234c4f0 config: ia64-randconfig-c44-20220904 compiler: ia64-linux-gcc (GCC) 12.1.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/intel-lab-lkp/linux/commit/aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47 git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review hezhongkun/cgroup-cpuset-Add-a-new-isolated-mems-policy-type/20220904-120436 git checkout aafa4e2c9e5c3cd4d02190ac2a7ed0654ed9cf47 # save the config file mkdir build_dir && cp config build_dir/.config COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=ia64 SHELL=/bin/bash If you fix the issue, kindly add following tag where applicable Reported-by: kernel test robot <lkp@intel.com> All errors (new ones prefixed by >>): kernel/cgroup/cpuset.c: In function 'cpuset_mpol_write': >> kernel/cgroup/cpuset.c:2526:15: error: implicit declaration of function 'mpol_parse_str'; did you mean 'mpol_to_str'? 
[-Werror=implicit-function-declaration] 2526 | err = mpol_parse_str(buf, &mpol); | ^~~~~~~~~~~~~~ | mpol_to_str kernel/cgroup/cpuset.c:2515:20: warning: unused variable 'cs_allowed' [-Wunused-variable] 2515 | nodemask_t cs_allowed; | ^~~~~~~~~~ cc1: some warnings being treated as errors vim +2526 kernel/cgroup/cpuset.c 2508 2509 /* change cpuset mempolicy */ 2510 static ssize_t cpuset_mpol_write(struct kernfs_open_file *of, 2511 char *buf, size_t nbytes, loff_t off) 2512 { 2513 struct mempolicy *mpol, *old = NULL; 2514 struct cpuset *cs = css_cs(of_css(of)); 2515 nodemask_t cs_allowed; 2516 int err = -ENODEV; 2517 2518 css_get(&cs->css); 2519 kernfs_break_active_protection(of->kn); 2520 percpu_down_write(&cpuset_rwsem); 2521 2522 if (!is_cpuset_online(cs)) 2523 goto out_unlock; 2524 2525 buf = strstrip(buf); > 2526 err = mpol_parse_str(buf, &mpol); 2527 2528 if (err) { 2529 err = -EINVAL; 2530 goto out_unlock; 2531 } 2532 2533 spin_lock_irq(&callback_lock); 2534 old = cs->mempolicy; 2535 update_cs_mpol_nodemask(cs, mpol); 2536 cs->mempolicy = mpol; 2537 spin_unlock_irq(&callback_lock); 2538 2539 update_tasks_cs_mpol(cs); 2540 2541 out_unlock: 2542 percpu_up_write(&cpuset_rwsem); 2543 kernfs_unbreak_active_protection(of->kn); 2544 css_put(&cs->css); 2545 2546 if (old) { 2547 /*Wait for outstanding programs to complete.*/ 2548 synchronize_rcu(); 2549 mpol_put(old); 2550 } 2551 return err ?: nbytes; 2552 } 2553
On Sun 04-09-22 12:02:41, hezhongkun wrote: > From: Zhongkun He <hezhongkun.hzk@bytedance.com> > > Mempolicy is difficult to use because it is set in-process > via a system call. We want to make it easier to use mempolicy > in cpuset, and we can control low-priority cgroups to > allocate memory in specified nodes. So this patch want to > adds the mempolicy interface in cpuset. > > The mempolicy priority of cpuset is lower than the task. > The order of getting the policy is: > 1) vma mempolicy > 2) task->mempolicy > 3) cpuset->mempolicy > 4) default policy. > > cpuset's policy is owned by itself, but descendants will > get the default mempolicy from parent. What is the hierarchical behavior of the policy? Say parent has a stronger requirement (say bind) than a child (prefer)? > How to use the mempolicy interface: > echo prefer:2 > /sys/fs/cgroup/zz/cpuset.mems.policy > echo bind:1-3 > /sys/fs/cgroup/zz/cpuset.mems.policy > echo interleave:0,1,2,3 >/sys/fs/cgroup/zz/cpuset.mems.policy Am I just confused or did you really mean to combine all these together?
Hi Michal, thanks for your reply. The current 'mempolicy' is hierarchically independent. The default value of the child is to inherit from the parent. The modification of the child policy will not be restricted by the parent. Of course, there are other options, such as the child's policy mode must be the same as the parent's. node can be the subset of parent's, but the interleave type will be complicated, that's why hierarchy independence is used. It would be better if you have other suggestions? Thanks. > On Sun 04-09-22 12:02:41, hezhongkun wrote: >> From: Zhongkun He <hezhongkun.hzk@bytedance.com> >> >> Mempolicy is difficult to use because it is set in-process >> via a system call. We want to make it easier to use mempolicy >> in cpuset, and we can control low-priority cgroups to >> allocate memory in specified nodes. So this patch want to >> adds the mempolicy interface in cpuset. >> >> The mempolicy priority of cpuset is lower than the task. >> The order of getting the policy is: >> 1) vma mempolicy >> 2) task->mempolicy >> 3) cpuset->mempolicy >> 4) default policy. >> >> cpuset's policy is owned by itself, but descendants will >> get the default mempolicy from parent. > > What is the hierarchical behavior of the policy? Say parent has a > stronger requirement (say bind) than a child (prefer)? > >> How to use the mempolicy interface: >> echo prefer:2 > /sys/fs/cgroup/zz/cpuset.mems.policy >> echo bind:1-3 > /sys/fs/cgroup/zz/cpuset.mems.policy >> echo interleave:0,1,2,3 >/sys/fs/cgroup/zz/cpuset.mems.policy > > Am I just confused or did you really mean to combine all these > together?
On Mon 05-09-22 18:30:55, Zhongkun He wrote: > Hi Michal, thanks for your reply. > > The current 'mempolicy' is hierarchically independent. The default value of > the child is to inherit from the parent. The modification of the child > policy will not be restricted by the parent. This breaks cgroup fundamental property of hierarchical enforcement of each property. And as such it is a no go. > Of course, there are other options, such as the child's policy mode must be > the same as the parent's. node can be the subset of parent's, but the > interleave type will be complicated, that's why hierarchy independence is > used. It would be better if you have other suggestions? Honestly, I am not really sure cgroup cpusets is a great fit for this usecase. It would be probably better to elaborate some more what are the existing shortcomings and what you would like to achieve. Just stating the syscall is a hard to use interface is not quite clear on its own. Btw. have you noticed this question? > > What is the hierarchical behavior of the policy? Say parent has a > > stronger requirement (say bind) than a child (prefer)? > > > How to use the mempolicy interface: > > > echo prefer:2 > /sys/fs/cgroup/zz/cpuset.mems.policy > > > echo bind:1-3 > /sys/fs/cgroup/zz/cpuset.mems.policy > > > echo interleave:0,1,2,3 >/sys/fs/cgroup/zz/cpuset.mems.policy > > > > Am I just confused or did you really mean to combine all these > > together?
> On Mon 05-09-22 18:30:55, Zhongkun He wrote: >> Hi Michal, thanks for your reply. >> >> The current 'mempolicy' is hierarchically independent. The default value of >> the child is to inherit from the parent. The modification of the child >> policy will not be restricted by the parent. > > This breaks cgroup fundamental property of hierarchical enforcement of > each property. And as such it is a no go. > >> Of course, there are other options, such as the child's policy mode must be >> the same as the parent's. node can be the subset of parent's, but the >> interleave type will be complicated, that's why hierarchy independence is >> used. It would be better if you have other suggestions? > > Honestly, I am not really sure cgroup cpusets is a great fit for this > usecase. It would be probably better to elaborate some more what are the > existing shortcomings and what you would like to achieve. Just stating > the syscall is a hard to use interface is not quite clear on its own. > > Btw. have you noticed this question? > >>> What is the hierarchical behavior of the policy? Say parent has a >>> stronger requirement (say bind) than a child (prefer)? >>>> How to use the mempolicy interface: >>>> echo prefer:2 > /sys/fs/cgroup/zz/cpuset.mems.policy >>>> echo bind:1-3 > /sys/fs/cgroup/zz/cpuset.mems.policy >>>> echo interleave:0,1,2,3 >/sys/fs/cgroup/zz/cpuset.mems.policy >>> >>> Am I just confused or did you really mean to combine all these >>> together? > Hi Michal, thanks for your reply. >>Say parent has a stronger requirement (say bind) than a child(prefer)? Yes, combine all these together. The parent's task will use 'bind', child's use 'prefer'.This is the current implementation, and we can discuss and modify it together if there are other suggestions. 1:Existing shortcomings In our use case, the application and the control plane are two separate systems. When the application is created, it doesn't know how to use memory, and it doesn't care. 
The control plane will decide the memory usage policy based on different reasons (the attributes of the application itself, the priority, the remaining resources of the system). Currently, numactl is used to set it at program startup, and the child process will inherit the mempolicy. But we can't dynamically adjust the memory policy, except restart, the memory policy will not change. 2:Our goals For the above reasons, we want to create a mempolicy at the cgroup level. Usually processes under a cgroup have the same priority and attributes, and we can dynamically adjust the memory allocation strategy according to the remaining resources of the system. For example, a low-priority cgroup uses the 'bind:2-3' policy, and a high-priority cgroup uses bind:0-1. When resources are insufficient, it will be changed to bind:3, bind:0-2 by the control plane, etc. Furthermore, more mempolicies can be extended, such as allocating memory according to node weight, etc. Thanks.
On Tue 06-09-22 18:37:40, Zhongkun He wrote: > > On Mon 05-09-22 18:30:55, Zhongkun He wrote: > > > Hi Michal, thanks for your reply. > > > > > > The current 'mempolicy' is hierarchically independent. The default value of > > > the child is to inherit from the parent. The modification of the child > > > policy will not be restricted by the parent. > > > > This breaks cgroup fundamental property of hierarchical enforcement of > > each property. And as such it is a no go. > > > > > Of course, there are other options, such as the child's policy mode must be > > > the same as the parent's. node can be the subset of parent's, but the > > > interleave type will be complicated, that's why hierarchy independence is > > > used. It would be better if you have other suggestions? > > > > Honestly, I am not really sure cgroup cpusets is a great fit for this > > usecase. It would be probably better to elaborate some more what are the > > existing shortcomings and what you would like to achieve. Just stating > > the syscall is a hard to use interface is not quite clear on its own. > > > > Btw. have you noticed this question? > > > > > > What is the hierarchical behavior of the policy? Say parent has a > > > > stronger requirement (say bind) than a child (prefer)? > > > > > How to use the mempolicy interface: > > > > > echo prefer:2 > /sys/fs/cgroup/zz/cpuset.mems.policy > > > > > echo bind:1-3 > /sys/fs/cgroup/zz/cpuset.mems.policy > > > > > echo interleave:0,1,2,3 >/sys/fs/cgroup/zz/cpuset.mems.policy > > > > > > > > Am I just confused or did you really mean to combine all these > > > > together? > > > > Hi Michal, thanks for your reply. > > >>Say parent has a stronger requirement (say bind) than a child(prefer)? > > Yes, combine all these together. What is the semantic of the resulting policy? > The parent's task will use 'bind', child's > use 'prefer'.This is the current implementation, and we can discuss and > modify it together if there are other suggestions. 
> > 1:Existing shortcomings > > In our use case, the application and the control plane are two separate > systems. When the application is created, it doesn't know how to use memory, > and it doesn't care. The control plane will decide the memory usage policy > based on different reasons (the attributes of the application itself, the > priority, the remaining resources of the system). Currently, numactl is used > to set it at program startup, and the child process will inherit the > mempolicy. Yes this is common practice I have seen so far. > But we can't dynamically adjust the memory policy, except > restart, the memory policy will not change. Do you really need to change the policy itself or only the effective nodemask? I mean what is your usecase to go from say mbind to preferred policy? Do you need any other policy than bind and preferred? > 2:Our goals > > For the above reasons, we want to create a mempolicy at the cgroup level. > Usually processes under a cgroup have the same priority and attributes, and > we can dynamically adjust the memory allocation strategy according to the > remaining resources of the system. For example, a low-priority cgroup uses > the 'bind:2-3' policy, and a high-priority cgroup uses bind:0-1. When > resources are insufficient, it will be changed to bind:3, bind:0-2 by > control plane, etc.Further more, more mempolicy can be extended, such as > allocating memory according to node weight, etc. Yes, I do understand that you want to change the node affinity and that is already possible with cpuset cgroup. The existing constrain is that the policy is hardcoded mbind IIRC. So you cannot really implement a dynamic preferred policy which would make some sense to me. The question is how to implement that with a sensible semantic. It is hard to partition the system into several cgroups if subset allows to spill over to others. 
Say something like the following root (nodes=0-3) / \ A (0, 1) B (2, 3) if both are MBIND then this makes sense because they are kinda isolated (at least for user allocations) but if B is PREFERRED and therefore allowed to use nodes 0 and 1 then it can deplete the memory from A and therefore isolation doesn't work at all. I can imagine that the all cgroups would use PREFERRED policy and then nobody can expect anything and the configuration is mostly best effort. But it feels like this is an abuse of the cgroup interface and a proper syscall interface is likely due. Would it make more sense to add pidfd_set_mempolicy and allow sufficiently privileged process to manipulate default memory policy of a remote process?
>> Hi Michal, thanks for your reply. >> >>>> Say parent has a stronger requirement (say bind) than a child(prefer)? >> >> Yes, combine all these together. > > What is the semantic of the resulting policy? > >> The parent's task will use 'bind', child's >> use 'prefer'.This is the current implementation, and we can discuss and >> modify it together if there are other suggestions. >> >> 1:Existing shortcomings >> >> In our use case, the application and the control plane are two separate >> systems. When the application is created, it doesn't know how to use memory, >> and it doesn't care. The control plane will decide the memory usage policy >> based on different reasons (the attributes of the application itself, the >> priority, the remaining resources of the system). Currently, numactl is used >> to set it at program startup, and the child process will inherit the >> mempolicy. > > Yes this is common practice I have seen so far. > >> But we can't dynamically adjust the memory policy, except >> restart, the memory policy will not change. > > Do you really need to change the policy itself or only the effective > nodemask? I mean what is your usecase to go from say mbind to preferred > policy? Do you need any other policy than bind and preferred? > >> 2:Our goals >> >> For the above reasons, we want to create a mempolicy at the cgroup level. >> Usually processes under a cgroup have the same priority and attributes, and >> we can dynamically adjust the memory allocation strategy according to the >> remaining resources of the system. For example, a low-priority cgroup uses >> the 'bind:2-3' policy, and a high-priority cgroup uses bind:0-1. When >> resources are insufficient, it will be changed to bind:3, bind:0-2 by >> control plane, etc.Further more, more mempolicy can be extended, such as >> allocating memory according to node weight, etc. > > Yes, I do understand that you want to change the node affinity and that > is already possible with cpuset cgroup. 
The existing constrain is that > the policy is hardcoded mbind IIRC. So you cannot really implement a dynamic > preferred policy which would make some sense to me. The question is how > to implement that with a sensible semantic. It is hard to partition the > system into several cgroups if subset allows to spill over to others. > Say something like the following > root (nodes=0-3) > / \ > A (0, 1) B (2, 3) > > if both are MBIND then this makes sense because they are kinda isolated > (at least for user allocations) but if B is PREFERRED and therefore > allowed to use nodes 0 and 1 then it can deplete the memory from A and > therefore isolation doesn't work at all. > > I can imagine that the all cgroups would use PREFERRED policy and then > nobody can expect anything and the configuration is mostly best effort. > But it feels like this is an abuse of the cgroup interface and a proper > syscall interface is likely due. Would it make more sense to add > pidfd_set_mempolicy and allow sufficiently privileged process to > manipulate default memory policy of a remote process? Hi Michal, thanks for your reply. > Do you really need to change the policy itself or only the effective > nodemask? Do you need any other policy than bind and preferred? Yes, we need to change the policy, not only his nodemask. we really want policy is interleave, and extend it to weight-interleave. Say something like the following node weight interleave: 0-3 1:1:1:1 default one by one weight-interleave: 0-3 1:2:4:6 alloc pages by weight (User set weight.) In the actual usecase, the remaining resources of each node are different, and the use of interleave cannot maximize the use of resources. Back to the previous question. >The question is how to implement that with a sensible semantic. Thanks for your analysis and suggestions.It is really difficult to add policy directly to cgroup for the hierarchical enforcement. It would be a good idea to add pidfd_set_mempolicy. Also, there is a new idea. 
We can try to separate the elements of mempolicy and use them independently. Mempolicy has two meanings: nodes: which nodes to use (nodes, 0-3); we can use cpuset's effective_mems directly. mode: how to use them (bind, prefer, etc.); change the mode to a cpuset->flags, such as CS_INTERLEAVE. task_struct->mems_allowed is equal to cpuset->effective_mems, which is hierarchical enforcement. CS_INTERLEAVE can also be updated into tasks, just like other flags (CS_SPREAD_PAGE). When a process needs to allocate memory, it can find the appropriate node to allocate pages according to the flag and mems_allowed. Thanks.
On Wed 07-09-22 21:50:24, Zhongkun He wrote: [...] > > Do you really need to change the policy itself or only the effective > > nodemask? Do you need any other policy than bind and preferred? > > Yes, we need to change the policy, not only his nodemask. we really want > policy is interleave, and extend it to weight-interleave. > Say something like the following > node weight > interleave: 0-3 1:1:1:1 default one by one > weight-interleave: 0-3 1:2:4:6 alloc pages by weight > (User set weight.) > In the actual usecase, the remaining resources of each node are different, > and the use of interleave cannot maximize the use of resources. OK, this seems a separate topic. It would be good to start by proposing that new policy in isolation with the semantic description. > Back to the previous question. > >The question is how to implement that with a sensible semantic. > > Thanks for your analysis and suggestions.It is really difficult to add > policy directly to cgroup for the hierarchical enforcement. It would be a > good idea to add pidfd_set_mempolicy. Are you going to pursue that path? > Also, there is a new idea. > We can try to separate the elements of mempolicy and use them independently. > Mempolicy has two meanings: > nodes:which nodes to use(nodes,0-3), we can use cpuset's effective_mems > directly. > mode:how to use them(bind,prefer,etc). change the mode to a > cpuset->flags,such as CS_INTERLEAVE。 > task_struct->mems_allowed is equal to cpuset->effective_mems,which is > hierarchical enforcement。CS_INTERLEAVE can also be updated into tasks, > just like other flags(CS_SPREAD_PAGE). > When a process needs to allocate memory, it can find the appropriate node to > allocate pages according to the flag and mems_allowed. I am not sure I see the advantage as the mode and nodes are always closely coupled. You cannot really have one wihtout the other.
> On Wed 07-09-22 21:50:24, Zhongkun He wrote: > [...] >>> Do you really need to change the policy itself or only the effective >>> nodemask? Do you need any other policy than bind and preferred? >> >> Yes, we need to change the policy, not only his nodemask. we really want >> policy is interleave, and extend it to weight-interleave. >> Say something like the following >> node weight >> interleave: 0-3 1:1:1:1 default one by one >> weight-interleave: 0-3 1:2:4:6 alloc pages by weight >> (User set weight.) >> In the actual usecase, the remaining resources of each node are different, >> and the use of interleave cannot maximize the use of resources. > > OK, this seems a separate topic. It would be good to start by proposing > that new policy in isolation with the semantic description. > >> Back to the previous question. >>> The question is how to implement that with a sensible semantic. >> >> Thanks for your analysis and suggestions.It is really difficult to add >> policy directly to cgroup for the hierarchical enforcement. It would be a >> good idea to add pidfd_set_mempolicy. > > Are you going to pursue that path? > >> Also, there is a new idea. >> We can try to separate the elements of mempolicy and use them independently. >> Mempolicy has two meanings: >> nodes:which nodes to use(nodes,0-3), we can use cpuset's effective_mems >> directly. >> mode:how to use them(bind,prefer,etc). change the mode to a >> cpuset->flags,such as CS_INTERLEAVE。 >> task_struct->mems_allowed is equal to cpuset->effective_mems,which is >> hierarchical enforcement。CS_INTERLEAVE can also be updated into tasks, >> just like other flags(CS_SPREAD_PAGE). >> When a process needs to allocate memory, it can find the appropriate node to >> allocate pages according to the flag and mems_allowed. > > I am not sure I see the advantage as the mode and nodes are always > closely coupled. You cannot really have one wihtout the other. > Hi Michal, thanks for your suggestion and reply. 
> Are you going to pursue that path? Yes, I'll give it a try as it makes sense to modify the policy dynamically. Thanks.
>> >>> Back to the previous question. >>>> The question is how to implement that with a sensible semantic. >>> >>> Thanks for your analysis and suggestions.It is really difficult to add >>> policy directly to cgroup for the hierarchical enforcement. It would >>> be a good idea to add pidfd_set_mempolicy. >> >> Are you going to pursue that path? > Hi Michal, thanks for your suggestion and reply. > > > Are you going to pursue that path? > > Yes,I'll give it a try as it makes sense to modify the policy dynamically. > > Thanks. Hi Michal, i have a question about pidfd_set_mempolicy, it would be better if you have some suggestions. The task_struct of processes and threads are independent. If we change the mempolicy of the process through pidfd_set_mempolicy, the mempolicy of its thread will not change. Of course users can set the mempolicy of all threads by iterating through /proc/tgid/task. The question is whether we should override the thread's mempolicy when setting the process's mempolicy. There are two options: A:Change the process's mempolicy and set that mempolicy to all it's threads. B:Only change the process's mempolicy in kernel. The mempolicy of the thread needs to be modified by the user through pidfd_set_mempolicy in userspace, if necessary. Thanks.
On Wed 14-09-22 23:10:47, Zhongkun He wrote: > > > > > > > Back to the previous question. > > > > > The question is how to implement that with a sensible semantic. > > > > > > > > Thanks for your analysis and suggestions.It is really difficult to add > > > > policy directly to cgroup for the hierarchical enforcement. It > > > > would be a good idea to add pidfd_set_mempolicy. > > > > > > Are you going to pursue that path? > > > Hi Michal, thanks for your suggestion and reply. > > > > > Are you going to pursue that path? > > > > Yes,I'll give it a try as it makes sense to modify the policy dynamically. > > > > Thanks. > > Hi Michal, i have a question about pidfd_set_mempolicy, it would be better > if you have some suggestions. > > The task_struct of processes and threads are independent. If we change the > mempolicy of the process through pidfd_set_mempolicy, the mempolicy of its > thread will not change. Of course users can set the mempolicy of all threads > by iterating through /proc/tgid/task. > > The question is whether we should override the thread's mempolicy when > setting the process's mempolicy. > > There are two options: > A:Change the process's mempolicy and set that mempolicy to all it's threads. > B:Only change the process's mempolicy in kernel. The mempolicy of the thread > needs to be modified by the user through pidfd_set_mempolicy in > userspace, if necessary. set_mempolicy is a per task_struct operation and so should be pidfd based API as well. If somebody requires a per-thread-group setting then the whole group should be iterated. I do not think we have any precendence where pidfd operation on the thread group leader has side effects on other threads as well.
> > set_mempolicy is a per task_struct operation and so should be pidfd > based API as well. If somebody requires a per-thread-group setting then > the whole group should be iterated. I do not think we have any > precedence where pidfd operation on the thread group leader has side > effects on other threads as well. Hi Michal, I got it, thanks for your suggestions and reply.
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 668389b4b53d..41c4b28fb71f 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -142,6 +142,7 @@ extern void numa_default_policy(void); extern void numa_policy_init(void); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); +extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask); extern int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, @@ -211,6 +212,11 @@ static inline void mpol_get(struct mempolicy *pol) { } +static inline struct mempolicy *mpol_dup(struct mempolicy *pol) +{ + return NULL; +} + struct shared_policy {}; static inline void mpol_shared_policy_init(struct shared_policy *sp, @@ -252,6 +258,9 @@ static inline void mpol_rebind_task(struct task_struct *tsk, static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { } +static inline void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) +{ +} static inline int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, diff --git a/include/linux/sched.h b/include/linux/sched.h index e7b2f8a5c711..f935e1707e7c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1236,6 +1236,8 @@ struct task_struct { unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA + /* cpuset memory policy */ + struct mempolicy *cs_mpol; /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1f3a55297f39..19fd359dc8d8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -118,6 +118,9 @@ struct cpuset { cpumask_var_t effective_cpus; nodemask_t effective_mems; + /*cpuset mem policy */ + struct mempolicy *mempolicy; + /* * CPUs allocated to child sub-partitions (default hierarchy only) * - CPUs granted by the parent = 
effective_cpus U subparts_cpus @@ -378,6 +381,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work); static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); +static void cpuset_change_task_cs_mpol(struct task_struct *tsk, + struct mempolicy *mpol); +static inline void update_cs_mpol_nodemask(struct cpuset *cs, + struct mempolicy *mpol); static inline void check_insane_mems_config(nodemask_t *nodes) { @@ -570,7 +577,10 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) if (!trial) return NULL; + mpol_get(trial->mempolicy); + if (alloc_cpumasks(trial, NULL)) { + mpol_put(trial->mempolicy); kfree(trial); return NULL; } @@ -587,6 +597,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) static inline void free_cpuset(struct cpuset *cs) { free_cpumasks(cs, NULL); + mpol_put(cs->mempolicy); kfree(cs); } @@ -1849,6 +1860,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) spin_lock_irq(&callback_lock); cp->effective_mems = *new_mems; + update_cs_mpol_nodemask(cp, cp->mempolicy); spin_unlock_irq(&callback_lock); WARN_ON(!is_in_v2_mode() && @@ -2304,7 +2316,8 @@ static void cpuset_attach(struct cgroup_taskset *tset) * fail. 
TODO: have a better way to handle failure here */ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); - + /*update the cpuset mempolicy to task*/ + cpuset_change_task_cs_mpol(task, cs->mempolicy); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, task); } @@ -2441,6 +2454,140 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, return retval; } +#ifdef CONFIG_NUMA + +/*update the mpol nodemask based on cpuset nodemask*/ +static inline void update_cs_mpol_nodemask(struct cpuset *cs, struct mempolicy *mpol) +{ + nodemask_t newmems; + + if (!mpol) + return; + nodes_and(newmems, cs->effective_mems, mpol->w.user_nodemask); + + /*If no intersection between effective_mems and mpol user_nodemask, + *use effective_mems to allocate mems. + */ + if (!nodes_weight(newmems)) + newmems = cs->effective_mems; + mpol_rebind_policy(mpol, &newmems); +} + +/*update cpuset policy for task*/ +static void cpuset_change_task_cs_mpol(struct task_struct *tsk, + struct mempolicy *mpol) +{ + struct mempolicy *old = NULL; + + task_lock(tsk); + local_irq_disable(); + write_seqcount_begin(&tsk->mems_allowed_seq); + + old = tsk->cs_mpol; + tsk->cs_mpol = mpol; + mpol_get(mpol); + tsk->il_prev = 0; + + write_seqcount_end(&tsk->mems_allowed_seq); + local_irq_enable(); + task_unlock(tsk); + mpol_put(old); +} + +static void update_tasks_cs_mpol(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + + while ((task = css_task_iter_next(&it))) + cpuset_change_task_cs_mpol(task, cs->mempolicy); + css_task_iter_end(&it); +} + +/* change cpuset mempolicy */ +static ssize_t cpuset_mpol_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mempolicy *mpol, *old = NULL; + struct cpuset *cs = css_cs(of_css(of)); + nodemask_t cs_allowed; + int err = -ENODEV; + + css_get(&cs->css); + kernfs_break_active_protection(of->kn); + 
percpu_down_write(&cpuset_rwsem); + + if (!is_cpuset_online(cs)) + goto out_unlock; + + buf = strstrip(buf); + err = mpol_parse_str(buf, &mpol); + + if (err) { + err = -EINVAL; + goto out_unlock; + } + + spin_lock_irq(&callback_lock); + old = cs->mempolicy; + update_cs_mpol_nodemask(cs, mpol); + cs->mempolicy = mpol; + spin_unlock_irq(&callback_lock); + + update_tasks_cs_mpol(cs); + +out_unlock: + percpu_up_write(&cpuset_rwsem); + kernfs_unbreak_active_protection(of->kn); + css_put(&cs->css); + + if (old) { + /*Wait for outstanding programs to complete.*/ + synchronize_rcu(); + mpol_put(old); + } + return err ?: nbytes; +} + +/*show cpuset mempolicy*/ +static int cpuset_mpol_show(struct seq_file *seq, void *v) +{ + char buffer[64]; + int ret = 0; + struct mempolicy *mpol; + struct cpuset *cs = css_cs(seq_css(seq)); + + memset(buffer, 0, sizeof(buffer)); + spin_lock_irq(&callback_lock); + mpol = cs->mempolicy; + + if (!mpol || mpol->mode == MPOL_DEFAULT) + goto out_unlock; + + mpol_to_str(buffer, sizeof(buffer), mpol); + seq_printf(seq, buffer); + seq_putc(seq, '\n'); + +out_unlock: + spin_unlock_irq(&callback_lock); + return ret; +} + +#else +static void cpuset_change_task_cs_mpol(struct task_struct *tsk, + struct mempolicy *mpol) +{ +} + +static inline void update_cs_mpol_nodemask(struct cpuset *cs, + struct mempolicy *mpol) +{ +} + +#endif + /* * Common handling for a write to a "cpus" or "mems" file. 
*/ @@ -2678,7 +2825,14 @@ static struct cftype legacy_files[] = { .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_MEMLIST, }, - +#ifdef CONFIG_NUMA + { + .name = "mems_policy", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpuset_mpol_show, + .write = cpuset_mpol_write, + }, +#endif { .name = "cpu_exclusive", .read_u64 = cpuset_read_u64, @@ -2786,7 +2940,14 @@ static struct cftype dfl_files[] = { .seq_show = cpuset_common_seq_show, .private = FILE_EFFECTIVE_MEMLIST, }, - +#ifdef CONFIG_NUMA + { + .name = "mems.policy", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpuset_mpol_show, + .write = cpuset_mpol_write, + }, +#endif { .name = "cpus.partition", .seq_show = sched_partition_show, @@ -2835,6 +2996,21 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; + if (IS_ENABLED(CONFIG_NUMA)) { + struct cpuset *pcs = css_cs(parent_css); + struct mempolicy *new; + + /*Inherit mempolicy from parent.*/ + spin_lock_irq(&callback_lock); + new = mpol_dup(pcs->mempolicy); + + if (IS_ERR(new)) + new = NULL; + + cs->mempolicy = new; + spin_unlock_irq(&callback_lock); + } + /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) __set_bit(CS_MEMORY_MIGRATE, &cs->flags); diff --git a/kernel/fork.c b/kernel/fork.c index 90c85b17bf69..3f695449e2a5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2190,6 +2190,7 @@ static __latent_entropy struct task_struct *copy_process( p->mempolicy = NULL; goto bad_fork_cleanup_delayacct; } + mpol_get(p->cs_mpol); /*ref cpuset mempolicy*/ #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b73d3248d976..144f79887df9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -158,9 +158,15 @@ int numa_map_to_online_node(int node) } EXPORT_SYMBOL_GPL(numa_map_to_online_node); +static inline struct mempolicy *task_or_cs_mpol(struct task_struct *p) +{ + return p->mempolicy ? 
+ p->mempolicy : p->cs_mpol; +} + struct mempolicy *get_task_policy(struct task_struct *p) { - struct mempolicy *pol = p->mempolicy; + struct mempolicy *pol = task_or_cs_mpol(p); int node; if (pol) @@ -349,7 +355,7 @@ static void mpol_rebind_preferred(struct mempolicy *pol, * policies are protected by task->mems_allowed_seq to prevent a premature * OOM/allocation failure due to parallel nodemask modification. */ -static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) +void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol || pol->mode == MPOL_LOCAL) return; @@ -1895,7 +1901,7 @@ unsigned int mempolicy_slab_node(void) if (!in_task()) return node; - policy = current->mempolicy; + policy = task_or_cs_mpol(current); if (!policy) return node; @@ -2043,7 +2049,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) return false; task_lock(current); - mempolicy = current->mempolicy; + mempolicy = task_or_cs_mpol(current); switch (mempolicy->mode) { case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: @@ -2633,13 +2639,16 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long */ void mpol_put_task_policy(struct task_struct *task) { - struct mempolicy *pol; + struct mempolicy *pol, *cs_pol; task_lock(task); pol = task->mempolicy; + cs_pol = task->cs_mpol; + task->cs_mpol = NULL; task->mempolicy = NULL; task_unlock(task); mpol_put(pol); + mpol_put(cs_pol); } static void sp_delete(struct shared_policy *sp, struct sp_node *n) @@ -2957,8 +2966,7 @@ static const char * const policy_modes[] = [MPOL_PREFERRED_MANY] = "prefer (many)", }; - -#ifdef CONFIG_TMPFS +#if defined(CONFIG_TMPFS) || defined(CONFIG_NUMA) /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. 
* @str: string containing mempolicy to parse @@ -3054,6 +3062,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) mode_flags |= MPOL_F_RELATIVE_NODES; else goto out; + } else { + /*use static mode_flags in default*/ + mode_flags |= MPOL_F_STATIC_NODES; } new = mpol_new(mode, mode_flags, &nodes);