Message ID | 20200403205930.1707-6-alex.kogan@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add NUMA-awareness to qspinlock | expand |
On 4/3/20 4:59 PM, Alex Kogan wrote: > Prohibit moving certain threads (e.g., in irq and nmi contexts) > to the secondary queue. Those prioritized threads will always stay > in the primary queue, and so will have a shorter wait time for the lock. > > Signed-off-by: Alex Kogan <alex.kogan@oracle.com> > Reviewed-by: Steve Sistare <steven.sistare@oracle.com> > Reviewed-by: Waiman Long <longman@redhat.com> > --- > kernel/locking/qspinlock_cna.h | 30 ++++++++++++++++++++++++++---- > 1 file changed, 26 insertions(+), 4 deletions(-) > > diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h > index e3180f6f5cdc..b004ce6882b6 100644 > --- a/kernel/locking/qspinlock_cna.h > +++ b/kernel/locking/qspinlock_cna.h > @@ -4,6 +4,7 @@ > #endif > > #include <linux/topology.h> > +#include <linux/sched/rt.h> > > /* > * Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock). > @@ -41,6 +42,9 @@ > * lock is passed to the next thread in the primary queue. To avoid starvation > * of threads in the secondary queue, those threads are moved back to the head > * of the primary queue after a certain number of intra-node lock hand-offs. > + * Lastly, certain threads (e.g., in irq and nmi contexts) are given > + * preferential treatment -- the scan stops when such a thread is found, > + * effectively never moving those threads into the secondary queue. > * > * For more details, see https://arxiv.org/abs/1810.05600. > * > @@ -50,7 +54,7 @@ > > struct cna_node { > struct mcs_spinlock mcs; > - int numa_node; > + int numa_node; /* use LSB for priority */ > u32 encoded_tail; /* self */ > u32 partial_order; /* encoded tail or enum val */ > u32 intra_count; > @@ -79,7 +83,7 @@ static void __init cna_init_nodes_per_cpu(unsigned int cpu) > for (i = 0; i < MAX_NODES; i++) { > struct cna_node *cn = (struct cna_node *)grab_mcs_node(base, i); > > - cn->numa_node = numa_node; > + cn->numa_node = numa_node << 1; > cn->encoded_tail = encode_tail(cpu, i); > /* > * make sure @encoded_tail is not confused with other valid > @@ -110,6 +114,14 @@ static int __init cna_init_nodes(void) > > static __always_inline void cna_init_node(struct mcs_spinlock *node) > { > + /* > + * Set the priority bit in @numa_node for threads that should not > + * be moved to the secondary queue. > + */ > + bool priority = !in_task() || irqs_disabled() || rt_task(current); > + ((struct cna_node *)node)->numa_node = > + (((struct cna_node *)node)->numa_node & ~1) | priority; > + > ((struct cna_node *)node)->intra_count = 0; > } > > @@ -243,12 +255,16 @@ static u32 cna_order_queue(struct mcs_spinlock *node, > { > struct cna_node *cni = (struct cna_node *)READ_ONCE(iter->next); > struct cna_node *cn = (struct cna_node *)node; > - int nid = cn->numa_node; > + int nid = cn->numa_node >> 1; > struct cna_node *last; > > /* find any next waiter on 'our' NUMA node */ > for (last = cn; > - cni && cni->numa_node != nid; > + /* > + * iterate as long as the current node is not priorizied and > + * does not run on 'our' NUMA node > + */ > + cni && !(cni->numa_node & 0x1) && (cni->numa_node >> 1) != nid; > last = cni, cni = (struct cna_node *)READ_ONCE(cni->mcs.next)) > ; > > @@ -258,6 +274,12 @@ static u32 cna_order_queue(struct mcs_spinlock *node, > if (last != cn) /* did we skip any waiters? */ > cna_splice_tail(node, node->next, (struct mcs_spinlock *)last); > > + /* > + * We return LOCAL_WAITER_FOUND here even if we stopped the scan because > + * of a prioritized waiter. That waiter will get the lock next even if > + * it runs on a different NUMA node, but this is what we wanted when we > + * prioritized it. > + */ > return LOCAL_WAITER_FOUND; > } > Sorry for the late review as I was swamped with other tasks earlier. It is good to have a patch to handle lock waiters that shouldn't be delayed, the current way of using bit 0 of numa_node to indicate that is kind of hackery. Also it may not be a good idea to change the current node id like that. I have a patch (attached) that can handle these issues. What do you think about it? Cheers, Longman
diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h index e3180f6f5cdc..b004ce6882b6 100644 --- a/kernel/locking/qspinlock_cna.h +++ b/kernel/locking/qspinlock_cna.h @@ -4,6 +4,7 @@ #endif #include <linux/topology.h> +#include <linux/sched/rt.h> /* * Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock). @@ -41,6 +42,9 @@ * lock is passed to the next thread in the primary queue. To avoid starvation * of threads in the secondary queue, those threads are moved back to the head * of the primary queue after a certain number of intra-node lock hand-offs. + * Lastly, certain threads (e.g., in irq and nmi contexts) are given + * preferential treatment -- the scan stops when such a thread is found, + * effectively never moving those threads into the secondary queue. * * For more details, see https://arxiv.org/abs/1810.05600. * @@ -50,7 +54,7 @@ struct cna_node { struct mcs_spinlock mcs; - int numa_node; + int numa_node; /* use LSB for priority */ u32 encoded_tail; /* self */ u32 partial_order; /* encoded tail or enum val */ u32 intra_count; @@ -79,7 +83,7 @@ static void __init cna_init_nodes_per_cpu(unsigned int cpu) for (i = 0; i < MAX_NODES; i++) { struct cna_node *cn = (struct cna_node *)grab_mcs_node(base, i); - cn->numa_node = numa_node; + cn->numa_node = numa_node << 1; cn->encoded_tail = encode_tail(cpu, i); /* * make sure @encoded_tail is not confused with other valid @@ -110,6 +114,14 @@ static int __init cna_init_nodes(void) static __always_inline void cna_init_node(struct mcs_spinlock *node) { + /* + * Set the priority bit in @numa_node for threads that should not + * be moved to the secondary queue. + */ + bool priority = !in_task() || irqs_disabled() || rt_task(current); + ((struct cna_node *)node)->numa_node = + (((struct cna_node *)node)->numa_node & ~1) | priority; + ((struct cna_node *)node)->intra_count = 0; } @@ -243,12 +255,16 @@ static u32 cna_order_queue(struct mcs_spinlock *node, { struct cna_node *cni = (struct cna_node *)READ_ONCE(iter->next); struct cna_node *cn = (struct cna_node *)node; - int nid = cn->numa_node; + int nid = cn->numa_node >> 1; struct cna_node *last; /* find any next waiter on 'our' NUMA node */ for (last = cn; - cni && cni->numa_node != nid; + /* + * iterate as long as the current node is not priorizied and + * does not run on 'our' NUMA node + */ + cni && !(cni->numa_node & 0x1) && (cni->numa_node >> 1) != nid; last = cni, cni = (struct cna_node *)READ_ONCE(cni->mcs.next)) ; @@ -258,6 +274,12 @@ static u32 cna_order_queue(struct mcs_spinlock *node, if (last != cn) /* did we skip any waiters? */ cna_splice_tail(node, node->next, (struct mcs_spinlock *)last); + /* + * We return LOCAL_WAITER_FOUND here even if we stopped the scan because + * of a prioritized waiter. That waiter will get the lock next even if + * it runs on a different NUMA node, but this is what we wanted when we + * prioritized it. + */ return LOCAL_WAITER_FOUND; }