@@ -254,6 +254,8 @@ MPOL_WEIGHTED_INTERLEAVE
This mode operates the same as MPOL_INTERLEAVE, except that
interleaving behavior is executed based on weights set in
/sys/kernel/mm/mempolicy/weighted_interleave/
+ when configured to utilize global weights, or based on task-local
+ weights configured with set_mempolicy2(2) or mbind2(2).
Weighted interleave allocations pages on nodes according to
their weight. For example if nodes [0,1] are weighted [5,2]
@@ -261,6 +263,13 @@ MPOL_WEIGHTED_INTERLEAVE
2 pages allocated on node1. This can better distribute data
according to bandwidth on heterogeneous memory systems.
+ When utilizing task-local weights, weights are not rebalanced
+ in the event of a task migration. If a weight has not been
+ explicitly set for a node that is set in the new nodemask, the
+ value of that weight defaults to "1". For this reason, if
+ migrations are expected or possible, users should consider
+ utilizing global interleave weights.
+
NUMA memory policy supports the following optional mode flags:
MPOL_F_STATIC_NODES
@@ -516,6 +525,7 @@ Extended Mempolicy Arguments::
__u64 addr; /* get_mempolicy2: policy address */
__s32 policy_node; /* get_mempolicy2: policy node information */
__s32 addr_node; /* get_mempolicy2: memory range policy */
+ __aligned_u64 il_weights; /* u8 buf of size pol_maxnodes */
};
The extended mempolicy argument structure is defined to allow the mempolicy
@@ -58,6 +58,7 @@ struct mempolicy {
/* Weighted interleave settings */
struct {
unsigned char cur_weight;
+ unsigned char weights[MAX_NUMNODES];
} wil;
};
@@ -73,6 +74,7 @@ struct mempolicy_args {
unsigned long addr; /* get: vma address */
int addr_node; /* get: node the address belongs to */
int home_node; /* mbind: use MPOL_MF_HOME_NODE */
+ unsigned char *il_weights; /* for mode MPOL_WEIGHTED_INTERLEAVE */
};
/*
@@ -37,6 +37,7 @@ struct mpol_args {
__u64 addr;
__s32 policy_node; /* get_mempolicy: policy node info */
__s32 addr_node; /* get_mempolicy: memory range policy */
+ __aligned_u64 il_weights; /* size: pol_maxnodes * sizeof(char) */
};
/* Flags for set_mempolicy */
@@ -77,6 +78,7 @@ struct mpol_args {
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */
+#define MPOL_F_GWEIGHT (1 << 5) /* Utilize global weights */
/*
* These bit locations are exposed in the vm.zone_reclaim_mode sysctl
@@ -271,6 +271,7 @@ static struct mempolicy *mpol_new(struct mempolicy_args *args)
unsigned short mode = args->mode;
unsigned short flags = args->mode_flags;
nodemask_t *nodes = args->policy_nodes;
+ int node;
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
@@ -297,6 +298,19 @@ static struct mempolicy *mpol_new(struct mempolicy_args *args)
(flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES))
return ERR_PTR(-EINVAL);
+ } else if (mode == MPOL_WEIGHTED_INTERLEAVE) {
+ /* weighted interleave requires a nodemask and weights > 0 */
+ if (nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
+ if (args->il_weights) {
+ node = first_node(*nodes);
+ while (node != MAX_NUMNODES) {
+ if (!args->il_weights[node])
+ return ERR_PTR(-EINVAL);
+ node = next_node(node, *nodes);
+ }
+ } else if (!(args->mode_flags & MPOL_F_GWEIGHT))
+ return ERR_PTR(-EINVAL);
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
@@ -309,6 +323,16 @@ static struct mempolicy *mpol_new(struct mempolicy_args *args)
policy->home_node = NUMA_NO_NODE;
policy->wil.cur_weight = 0;
policy->home_node = args->home_node;
+ if (policy->mode == MPOL_WEIGHTED_INTERLEAVE && args->il_weights) {
+ policy->wil.cur_weight = 0;
+ /* Minimum weight value is always 1 */
+ memset(policy->wil.weights, 1, MAX_NUMNODES);
+ node = first_node(*nodes);
+ while (node != MAX_NUMNODES) {
+ policy->wil.weights[node] = args->il_weights[node];
+ node = next_node(node, *nodes);
+ }
+ }
return policy;
}
@@ -1518,6 +1542,9 @@ static long kernel_mbind(unsigned long start, unsigned long len,
if (err)
return err;
+ if (mode & MPOL_WEIGHTED_INTERLEAVE)
+ mode_flags |= MPOL_F_GWEIGHT;
+
memset(&margs, 0, sizeof(margs));
margs.mode = lmode;
margs.mode_flags = mode_flags;
@@ -1611,6 +1638,8 @@ SYSCALL_DEFINE5(mbind2, const struct iovec __user *, vec, size_t, vlen,
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
+ unsigned char weights[MAX_NUMNODES];
+ unsigned char *weights_ptr;
int err;
if (!vec || !vlen)
@@ -1648,6 +1677,20 @@ SYSCALL_DEFINE5(mbind2, const struct iovec __user *, vec, size_t, vlen,
} else
margs.policy_nodes = NULL;
+ if (kargs.mode == MPOL_WEIGHTED_INTERLEAVE) {
+ weights_ptr = u64_to_user_ptr(kargs.il_weights);
+ err = copy_struct_from_user(&weights,
+ sizeof(weights),
+ weights_ptr,
+ kargs.pol_maxnodes);
+ if (err)
+ return err;
+ margs.il_weights = weights;
+ } else {
+ margs.il_weights = NULL;
+ flags |= MPOL_F_GWEIGHT;
+ }
+
/* For each address range in vector, do_mbind */
err = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov,
&iter);
@@ -1686,6 +1729,9 @@ static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
if (err)
return err;
+ if (mode & MPOL_WEIGHTED_INTERLEAVE)
+ mode_flags |= MPOL_F_GWEIGHT;
+
memset(&args, 0, sizeof(args));
args.mode = lmode;
args.mode_flags = mode_flags;
@@ -1709,6 +1755,8 @@ SYSCALL_DEFINE3(set_mempolicy2, struct mpol_args __user *, uargs, size_t, usize,
int err;
nodemask_t policy_nodemask;
unsigned long __user *nodes_ptr;
+ unsigned char weights[MAX_NUMNODES];
+ unsigned char __user *weights_ptr;
if (flags)
return -EINVAL;
@@ -1734,6 +1782,20 @@ SYSCALL_DEFINE3(set_mempolicy2, struct mpol_args __user *, uargs, size_t, usize,
} else
margs.policy_nodes = NULL;
+ if (kargs.mode == MPOL_WEIGHTED_INTERLEAVE && kargs.il_weights) {
+ weights_ptr = u64_to_user_ptr(kargs.il_weights);
+ err = copy_struct_from_user(weights,
+ sizeof(weights),
+ weights_ptr,
+ kargs.pol_maxnodes);
+ if (err)
+ return err;
+ margs.il_weights = weights;
+ } else {
+ margs.il_weights = NULL;
+ flags |= MPOL_F_GWEIGHT;
+ }
+
return do_set_mempolicy(&margs);
}
@@ -1935,6 +1997,8 @@ SYSCALL_DEFINE3(get_mempolicy2, struct mpol_args __user *, uargs, size_t, usize,
int err;
nodemask_t policy_nodemask;
unsigned long __user *nodes_ptr;
+ unsigned char *weights_ptr;
+ unsigned char weights[MAX_NUMNODES];
err = copy_struct_from_user(&kargs, sizeof(kargs), uargs, usize);
if (err)
@@ -1951,6 +2015,9 @@ SYSCALL_DEFINE3(get_mempolicy2, struct mpol_args __user *, uargs, size_t, usize,
&policy_nodemask);
}
+ if (kargs.il_weights)
+ margs.il_weights = weights;
+
margs.policy_nodes = kargs.pol_nodes ? &policy_nodemask : NULL;
if (flags & MPOL_F_ADDR) {
margs.addr = kargs.addr;
@@ -1971,6 +2038,13 @@ SYSCALL_DEFINE3(get_mempolicy2, struct mpol_args __user *, uargs, size_t, usize,
margs.policy_nodes);
}
+ if (kargs.il_weights) {
+ weights_ptr = u64_to_user_ptr(kargs.il_weights);
+ err = copy_to_user(weights_ptr, weights, kargs.pol_maxnodes);
+ if (err)
+ return err;
+ }
+
return copy_to_user(uargs, &kargs, usize) ? -EFAULT : 0;
}
@@ -2087,13 +2161,18 @@ static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
unsigned int next;
struct task_struct *me = current;
+ unsigned char next_weight;
next = next_node_in(me->il_prev, policy->nodes);
if (next == MAX_NUMNODES)
return next;
- if (!policy->wil.cur_weight)
- policy->wil.cur_weight = iw_table[next];
+ if (!policy->wil.cur_weight) {
+ next_weight = (policy->flags & MPOL_F_GWEIGHT) ?
+ iw_table[next] :
+ policy->wil.weights[next];
+ policy->wil.cur_weight = next_weight ? next_weight : 1;
+ }
policy->wil.cur_weight--;
if (!policy->wil.cur_weight)
@@ -2167,6 +2246,7 @@ static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
nodemask_t nodemask = pol->nodes;
unsigned int target, weight_total = 0;
int nid;
+ unsigned char *pol_weights;
unsigned char weights[MAX_NUMNODES];
unsigned char weight;
@@ -2178,8 +2258,13 @@ static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
return nid;
/* Then collect weights on stack and calculate totals */
+ if (pol->flags & MPOL_F_GWEIGHT)
+ pol_weights = iw_table;
+ else
+ pol_weights = pol->wil.weights;
+
for_each_node_mask(nid, nodemask) {
- weight = iw_table[nid];
+ weight = pol_weights[nid];
weight_total += weight;
weights[nid] = weight;
}
@@ -2577,6 +2662,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
unsigned long nr_allocated;
unsigned long rounds;
unsigned long node_pages, delta;
+ unsigned char *pol_weights;
unsigned char weight;
unsigned char weights[MAX_NUMNODES];
unsigned int weight_total;
@@ -2590,9 +2676,14 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
nnodes = nodes_weight(nodes);
+ if (pol->flags & MPOL_F_GWEIGHT)
+ pol_weights = iw_table;
+ else
+ pol_weights = pol->wil.weights;
+
/* Collect weights and save them on stack so they don't change */
for_each_node_mask(node, nodes) {
- weight = iw_table[node];
+ weight = pol_weights[node];
weight_total += weight;
weights[node] = weight;
}
@@ -3117,6 +3208,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
int ret;
struct mempolicy_args margs;
+ unsigned char weights[MAX_NUMNODES];
sp->root = RB_ROOT; /* empty tree == default mempolicy */
rwlock_init(&sp->lock);
@@ -3134,6 +3226,11 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
margs.mode_flags = mpol->flags;
margs.policy_nodes = &mpol->w.user_nodemask;
margs.home_node = NUMA_NO_NODE;
+ if (margs.mode == MPOL_WEIGHTED_INTERLEAVE &&
+ !(margs.mode_flags & MPOL_F_GWEIGHT)) {
+ memcpy(weights, mpol->wil.weights, sizeof(weights));
+ margs.il_weights = weights;
+ }
/* contextualize the tmpfs mount point mempolicy to this file */
npol = mpol_new(&margs);