diff mbox series

[RFC,11/14] arm64: Move the ASID allocator code in a separate file

Message ID 20190321163623.20219-12-julien.grall@arm.com (mailing list archive)
State RFC
Headers show
Series kvm/arm: Align the VMID allocation with the arm64 ASID one | expand

Commit Message

Julien Grall March 21, 2019, 4:36 p.m. UTC
We will want to re-use the ASID allocator in a separate context (e.g
allocating VMID). So move the code in a new file.

The function asid_check_context has been moved in the header as a static
inline function because we want to avoid add a branch when checking if the
ASID is still valid.

Signed-off-by: Julien Grall <julien.grall@arm.com>

---

This code will be used in the virt code for allocating VMID. I am not
entirely sure where to place it. Lib could potentially be a good place but I
am not entirely convinced the algo as it is could be used by other
architecture.

Looking at x86, it seems that it will not be possible to re-use because
the number of PCID (aka ASID) could be smaller than the number of CPUs.
See commit message 10af6235e0d327d42e1bad974385197817923dc1 "x86/mm:
Implement PCID based optimization: try to preserve old TLB entries using
PCI".
---
 arch/arm64/include/asm/asid.h |  77 ++++++++++++++
 arch/arm64/lib/Makefile       |   2 +
 arch/arm64/lib/asid.c         | 185 +++++++++++++++++++++++++++++++++
 arch/arm64/mm/context.c       | 235 +-----------------------------------------
 4 files changed, 267 insertions(+), 232 deletions(-)
 create mode 100644 arch/arm64/include/asm/asid.h
 create mode 100644 arch/arm64/lib/asid.c

Comments

Julien Grall June 5, 2019, 4:56 p.m. UTC | #1
Hi,

I am CCing RISC-V folks to see if there are an interest to share the code.

@RISC-V: I noticed you are discussing about importing a version of ASID 
allocator in RISC-V. At a first look, the code looks quite similar. Would the 
library below helps you?

Cheers,

On 21/03/2019 16:36, Julien Grall wrote:
> We will want to re-use the ASID allocator in a separate context (e.g
> allocating VMID). So move the code in a new file.
> 
> The function asid_check_context has been moved in the header as a static
> inline function because we want to avoid add a branch when checking if the
> ASID is still valid.
> 
> Signed-off-by: Julien Grall <julien.grall@arm.com>
> 
> ---
> 
> This code will be used in the virt code for allocating VMID. I am not
> entirely sure where to place it. Lib could potentially be a good place but I
> am not entirely convinced the algo as it is could be used by other
> architecture.
> 
> Looking at x86, it seems that it will not be possible to re-use because
> the number of PCID (aka ASID) could be smaller than the number of CPUs.
> See commit message 10af6235e0d327d42e1bad974385197817923dc1 "x86/mm:
> Implement PCID based optimization: try to preserve old TLB entries using
> PCI".
> ---
>   arch/arm64/include/asm/asid.h |  77 ++++++++++++++
>   arch/arm64/lib/Makefile       |   2 +
>   arch/arm64/lib/asid.c         | 185 +++++++++++++++++++++++++++++++++
>   arch/arm64/mm/context.c       | 235 +-----------------------------------------
>   4 files changed, 267 insertions(+), 232 deletions(-)
>   create mode 100644 arch/arm64/include/asm/asid.h
>   create mode 100644 arch/arm64/lib/asid.c
> 
> diff --git a/arch/arm64/include/asm/asid.h b/arch/arm64/include/asm/asid.h
> new file mode 100644
> index 000000000000..bb62b587f37f
> --- /dev/null
> +++ b/arch/arm64/include/asm/asid.h
> @@ -0,0 +1,77 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __ASM_ASM_ASID_H
> +#define __ASM_ASM_ASID_H
> +
> +#include <linux/atomic.h>
> +#include <linux/compiler.h>
> +#include <linux/cpumask.h>
> +#include <linux/percpu.h>
> +#include <linux/spinlock.h>
> +
> +struct asid_info
> +{
> +	atomic64_t	generation;
> +	unsigned long	*map;
> +	atomic64_t __percpu	*active;
> +	u64 __percpu		*reserved;
> +	u32			bits;
> +	/* Lock protecting the structure */
> +	raw_spinlock_t		lock;
> +	/* Which CPU requires context flush on next call */
> +	cpumask_t		flush_pending;
> +	/* Number of ASID allocated by context (shift value) */
> +	unsigned int		ctxt_shift;
> +	/* Callback to locally flush the context. */
> +	void			(*flush_cpu_ctxt_cb)(void);
> +};
> +
> +#define NUM_ASIDS(info)			(1UL << ((info)->bits))
> +#define NUM_CTXT_ASIDS(info)		(NUM_ASIDS(info) >> (info)->ctxt_shift)
> +
> +#define active_asid(info, cpu)	*per_cpu_ptr((info)->active, cpu)
> +
> +void asid_new_context(struct asid_info *info, atomic64_t *pasid,
> +		      unsigned int cpu);
> +
> +/*
> + * Check the ASID is still valid for the context. If not generate a new ASID.
> + *
> + * @pasid: Pointer to the current ASID batch
> + * @cpu: current CPU ID. Must have been acquired throught get_cpu()
> + */
> +static inline void asid_check_context(struct asid_info *info,
> +				      atomic64_t *pasid, unsigned int cpu)
> +{
> +	u64 asid, old_active_asid;
> +
> +	asid = atomic64_read(pasid);
> +
> +	/*
> +	 * The memory ordering here is subtle.
> +	 * If our active_asid is non-zero and the ASID matches the current
> +	 * generation, then we update the active_asid entry with a relaxed
> +	 * cmpxchg. Racing with a concurrent rollover means that either:
> +	 *
> +	 * - We get a zero back from the cmpxchg and end up waiting on the
> +	 *   lock. Taking the lock synchronises with the rollover and so
> +	 *   we are forced to see the updated generation.
> +	 *
> +	 * - We get a valid ASID back from the cmpxchg, which means the
> +	 *   relaxed xchg in flush_context will treat us as reserved
> +	 *   because atomic RmWs are totally ordered for a given location.
> +	 */
> +	old_active_asid = atomic64_read(&active_asid(info, cpu));
> +	if (old_active_asid &&
> +	    !((asid ^ atomic64_read(&info->generation)) >> info->bits) &&
> +	    atomic64_cmpxchg_relaxed(&active_asid(info, cpu),
> +				     old_active_asid, asid))
> +		return;
> +
> +	asid_new_context(info, pasid, cpu);
> +}
> +
> +int asid_allocator_init(struct asid_info *info,
> +			u32 bits, unsigned int asid_per_ctxt,
> +			void (*flush_cpu_ctxt_cb)(void));
> +
> +#endif
> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> index 5540a1638baf..720df5ee2aa2 100644
> --- a/arch/arm64/lib/Makefile
> +++ b/arch/arm64/lib/Makefile
> @@ -5,6 +5,8 @@ lib-y		:= clear_user.o delay.o copy_from_user.o		\
>   		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
>   		   strchr.o strrchr.o tishift.o
>   
> +lib-y		+= asid.o
> +
>   ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
>   obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
>   CFLAGS_REMOVE_xor-neon.o	+= -mgeneral-regs-only
> diff --git a/arch/arm64/lib/asid.c b/arch/arm64/lib/asid.c
> new file mode 100644
> index 000000000000..72b71bfb32be
> --- /dev/null
> +++ b/arch/arm64/lib/asid.c
> @@ -0,0 +1,185 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Generic ASID allocator.
> + *
> + * Based on arch/arm/mm/context.c
> + *
> + * Copyright (C) 2002-2003 Deep Blue Solutions Ltd, all rights reserved.
> + * Copyright (C) 2012 ARM Ltd.
> + */
> +
> +#include <linux/slab.h>
> +
> +#include <asm/asid.h>
> +
> +#define reserved_asid(info, cpu) *per_cpu_ptr((info)->reserved, cpu)
> +
> +#define ASID_MASK(info)			(~GENMASK((info)->bits - 1, 0))
> +#define ASID_FIRST_VERSION(info)	(1UL << ((info)->bits))
> +
> +#define asid2idx(info, asid)		(((asid) & ~ASID_MASK(info)) >> (info)->ctxt_shift)
> +#define idx2asid(info, idx)		(((idx) << (info)->ctxt_shift) & ~ASID_MASK(info))
> +
> +static void flush_context(struct asid_info *info)
> +{
> +	int i;
> +	u64 asid;
> +
> +	/* Update the list of reserved ASIDs and the ASID bitmap. */
> +	bitmap_clear(info->map, 0, NUM_CTXT_ASIDS(info));
> +
> +	for_each_possible_cpu(i) {
> +		asid = atomic64_xchg_relaxed(&active_asid(info, i), 0);
> +		/*
> +		 * If this CPU has already been through a
> +		 * rollover, but hasn't run another task in
> +		 * the meantime, we must preserve its reserved
> +		 * ASID, as this is the only trace we have of
> +		 * the process it is still running.
> +		 */
> +		if (asid == 0)
> +			asid = reserved_asid(info, i);
> +		__set_bit(asid2idx(info, asid), info->map);
> +		reserved_asid(info, i) = asid;
> +	}
> +
> +	/*
> +	 * Queue a TLB invalidation for each CPU to perform on next
> +	 * context-switch
> +	 */
> +	cpumask_setall(&info->flush_pending);
> +}
> +
> +static bool check_update_reserved_asid(struct asid_info *info, u64 asid,
> +				       u64 newasid)
> +{
> +	int cpu;
> +	bool hit = false;
> +
> +	/*
> +	 * Iterate over the set of reserved ASIDs looking for a match.
> +	 * If we find one, then we can update our mm to use newasid
> +	 * (i.e. the same ASID in the current generation) but we can't
> +	 * exit the loop early, since we need to ensure that all copies
> +	 * of the old ASID are updated to reflect the mm. Failure to do
> +	 * so could result in us missing the reserved ASID in a future
> +	 * generation.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		if (reserved_asid(info, cpu) == asid) {
> +			hit = true;
> +			reserved_asid(info, cpu) = newasid;
> +		}
> +	}
> +
> +	return hit;
> +}
> +
> +static u64 new_context(struct asid_info *info, atomic64_t *pasid)
> +{
> +	static u32 cur_idx = 1;
> +	u64 asid = atomic64_read(pasid);
> +	u64 generation = atomic64_read(&info->generation);
> +
> +	if (asid != 0) {
> +		u64 newasid = generation | (asid & ~ASID_MASK(info));
> +
> +		/*
> +		 * If our current ASID was active during a rollover, we
> +		 * can continue to use it and this was just a false alarm.
> +		 */
> +		if (check_update_reserved_asid(info, asid, newasid))
> +			return newasid;
> +
> +		/*
> +		 * We had a valid ASID in a previous life, so try to re-use
> +		 * it if possible.
> +		 */
> +		if (!__test_and_set_bit(asid2idx(info, asid), info->map))
> +			return newasid;
> +	}
> +
> +	/*
> +	 * Allocate a free ASID. If we can't find one, take a note of the
> +	 * currently active ASIDs and mark the TLBs as requiring flushes.  We
> +	 * always count from ASID #2 (index 1), as we use ASID #0 when setting
> +	 * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd
> +	 * pairs.
> +	 */
> +	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), cur_idx);
> +	if (asid != NUM_CTXT_ASIDS(info))
> +		goto set_asid;
> +
> +	/* We're out of ASIDs, so increment the global generation count */
> +	generation = atomic64_add_return_relaxed(ASID_FIRST_VERSION(info),
> +						 &info->generation);
> +	flush_context(info);
> +
> +	/* We have more ASIDs than CPUs, so this will always succeed */
> +	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), 1);
> +
> +set_asid:
> +	__set_bit(asid, info->map);
> +	cur_idx = asid;
> +	return idx2asid(info, asid) | generation;
> +}
> +
> +/*
> + * Generate a new ASID for the context.
> + *
> + * @pasid: Pointer to the current ASID batch allocated. It will be updated
> + * with the new ASID batch.
> + * @cpu: current CPU ID. Must have been acquired through get_cpu()
> + */
> +void asid_new_context(struct asid_info *info, atomic64_t *pasid,
> +		      unsigned int cpu)
> +{
> +	unsigned long flags;
> +	u64 asid;
> +
> +	raw_spin_lock_irqsave(&info->lock, flags);
> +	/* Check that our ASID belongs to the current generation. */
> +	asid = atomic64_read(pasid);
> +	if ((asid ^ atomic64_read(&info->generation)) >> info->bits) {
> +		asid = new_context(info, pasid);
> +		atomic64_set(pasid, asid);
> +	}
> +
> +	if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending))
> +		info->flush_cpu_ctxt_cb();
> +
> +	atomic64_set(&active_asid(info, cpu), asid);
> +	raw_spin_unlock_irqrestore(&info->lock, flags);
> +}
> +
> +/*
> + * Initialize the ASID allocator
> + *
> + * @info: Pointer to the asid allocator structure
> + * @bits: Number of ASIDs available
> + * @asid_per_ctxt: Number of ASIDs to allocate per-context. ASIDs are
> + * allocated contiguously for a given context. This value should be a power of
> + * 2.
> + */
> +int asid_allocator_init(struct asid_info *info,
> +			u32 bits, unsigned int asid_per_ctxt,
> +			void (*flush_cpu_ctxt_cb)(void))
> +{
> +	info->bits = bits;
> +	info->ctxt_shift = ilog2(asid_per_ctxt);
> +	info->flush_cpu_ctxt_cb = flush_cpu_ctxt_cb;
> +	/*
> +	 * Expect allocation after rollover to fail if we don't have at least
> +	 * one more ASID than CPUs. ASID #0 is always reserved.
> +	 */
> +	WARN_ON(NUM_CTXT_ASIDS(info) - 1 <= num_possible_cpus());
> +	atomic64_set(&info->generation, ASID_FIRST_VERSION(info));
> +	info->map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
> +			    sizeof(*info->map), GFP_KERNEL);
> +	if (!info->map)
> +		return -ENOMEM;
> +
> +	raw_spin_lock_init(&info->lock);
> +
> +	return 0;
> +}
> diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
> index 678a57b77c91..95ee7711a2ef 100644
> --- a/arch/arm64/mm/context.c
> +++ b/arch/arm64/mm/context.c
> @@ -22,47 +22,22 @@
>   #include <linux/slab.h>
>   #include <linux/mm.h>
>   
> +#include <asm/asid.h>
>   #include <asm/cpufeature.h>
>   #include <asm/mmu_context.h>
>   #include <asm/smp.h>
>   #include <asm/tlbflush.h>
>   
> -struct asid_info
> -{
> -	atomic64_t	generation;
> -	unsigned long	*map;
> -	atomic64_t __percpu	*active;
> -	u64 __percpu		*reserved;
> -	u32			bits;
> -	raw_spinlock_t		lock;
> -	/* Which CPU requires context flush on next call */
> -	cpumask_t		flush_pending;
> -	/* Number of ASID allocated by context (shift value) */
> -	unsigned int		ctxt_shift;
> -	/* Callback to locally flush the context. */
> -	void			(*flush_cpu_ctxt_cb)(void);
> -} asid_info;
> -
> -#define active_asid(info, cpu)	*per_cpu_ptr((info)->active, cpu)
> -#define reserved_asid(info, cpu) *per_cpu_ptr((info)->reserved, cpu)
> -
>   static DEFINE_PER_CPU(atomic64_t, active_asids);
>   static DEFINE_PER_CPU(u64, reserved_asids);
>   
> -#define ASID_MASK(info)			(~GENMASK((info)->bits - 1, 0))
> -#define NUM_ASIDS(info)			(1UL << ((info)->bits))
> -
> -#define ASID_FIRST_VERSION(info)	NUM_ASIDS(info)
> -
>   #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
>   #define ASID_PER_CONTEXT		2
>   #else
>   #define ASID_PER_CONTEXT		1
>   #endif
>   
> -#define NUM_CTXT_ASIDS(info)		(NUM_ASIDS(info) >> (info)->ctxt_shift)
> -#define asid2idx(info, asid)		(((asid) & ~ASID_MASK(info)) >> (info)->ctxt_shift)
> -#define idx2asid(info, idx)		(((idx) << (info)->ctxt_shift) & ~ASID_MASK(info))
> +struct asid_info asid_info;
>   
>   /* Get the ASIDBits supported by the current CPU */
>   static u32 get_cpu_asid_bits(void)
> @@ -102,178 +77,6 @@ void verify_cpu_asid_bits(void)
>   	}
>   }
>   
> -static void flush_context(struct asid_info *info)
> -{
> -	int i;
> -	u64 asid;
> -
> -	/* Update the list of reserved ASIDs and the ASID bitmap. */
> -	bitmap_clear(info->map, 0, NUM_CTXT_ASIDS(info));
> -
> -	for_each_possible_cpu(i) {
> -		asid = atomic64_xchg_relaxed(&active_asid(info, i), 0);
> -		/*
> -		 * If this CPU has already been through a
> -		 * rollover, but hasn't run another task in
> -		 * the meantime, we must preserve its reserved
> -		 * ASID, as this is the only trace we have of
> -		 * the process it is still running.
> -		 */
> -		if (asid == 0)
> -			asid = reserved_asid(info, i);
> -		__set_bit(asid2idx(info, asid), info->map);
> -		reserved_asid(info, i) = asid;
> -	}
> -
> -	/*
> -	 * Queue a TLB invalidation for each CPU to perform on next
> -	 * context-switch
> -	 */
> -	cpumask_setall(&info->flush_pending);
> -}
> -
> -static bool check_update_reserved_asid(struct asid_info *info, u64 asid,
> -				       u64 newasid)
> -{
> -	int cpu;
> -	bool hit = false;
> -
> -	/*
> -	 * Iterate over the set of reserved ASIDs looking for a match.
> -	 * If we find one, then we can update our mm to use newasid
> -	 * (i.e. the same ASID in the current generation) but we can't
> -	 * exit the loop early, since we need to ensure that all copies
> -	 * of the old ASID are updated to reflect the mm. Failure to do
> -	 * so could result in us missing the reserved ASID in a future
> -	 * generation.
> -	 */
> -	for_each_possible_cpu(cpu) {
> -		if (reserved_asid(info, cpu) == asid) {
> -			hit = true;
> -			reserved_asid(info, cpu) = newasid;
> -		}
> -	}
> -
> -	return hit;
> -}
> -
> -static u64 new_context(struct asid_info *info, atomic64_t *pasid)
> -{
> -	static u32 cur_idx = 1;
> -	u64 asid = atomic64_read(pasid);
> -	u64 generation = atomic64_read(&info->generation);
> -
> -	if (asid != 0) {
> -		u64 newasid = generation | (asid & ~ASID_MASK(info));
> -
> -		/*
> -		 * If our current ASID was active during a rollover, we
> -		 * can continue to use it and this was just a false alarm.
> -		 */
> -		if (check_update_reserved_asid(info, asid, newasid))
> -			return newasid;
> -
> -		/*
> -		 * We had a valid ASID in a previous life, so try to re-use
> -		 * it if possible.
> -		 */
> -		if (!__test_and_set_bit(asid2idx(info, asid), info->map))
> -			return newasid;
> -	}
> -
> -	/*
> -	 * Allocate a free ASID. If we can't find one, take a note of the
> -	 * currently active ASIDs and mark the TLBs as requiring flushes.  We
> -	 * always count from ASID #2 (index 1), as we use ASID #0 when setting
> -	 * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd
> -	 * pairs.
> -	 */
> -	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), cur_idx);
> -	if (asid != NUM_CTXT_ASIDS(info))
> -		goto set_asid;
> -
> -	/* We're out of ASIDs, so increment the global generation count */
> -	generation = atomic64_add_return_relaxed(ASID_FIRST_VERSION(info),
> -						 &info->generation);
> -	flush_context(info);
> -
> -	/* We have more ASIDs than CPUs, so this will always succeed */
> -	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), 1);
> -
> -set_asid:
> -	__set_bit(asid, info->map);
> -	cur_idx = asid;
> -	return idx2asid(info, asid) | generation;
> -}
> -
> -static void asid_new_context(struct asid_info *info, atomic64_t *pasid,
> -			     unsigned int cpu);
> -
> -/*
> - * Check the ASID is still valid for the context. If not generate a new ASID.
> - *
> - * @pasid: Pointer to the current ASID batch
> - * @cpu: current CPU ID. Must have been acquired throught get_cpu()
> - */
> -static void asid_check_context(struct asid_info *info,
> -			       atomic64_t *pasid, unsigned int cpu)
> -{
> -	u64 asid, old_active_asid;
> -
> -	asid = atomic64_read(pasid);
> -
> -	/*
> -	 * The memory ordering here is subtle.
> -	 * If our active_asid is non-zero and the ASID matches the current
> -	 * generation, then we update the active_asid entry with a relaxed
> -	 * cmpxchg. Racing with a concurrent rollover means that either:
> -	 *
> -	 * - We get a zero back from the cmpxchg and end up waiting on the
> -	 *   lock. Taking the lock synchronises with the rollover and so
> -	 *   we are forced to see the updated generation.
> -	 *
> -	 * - We get a valid ASID back from the cmpxchg, which means the
> -	 *   relaxed xchg in flush_context will treat us as reserved
> -	 *   because atomic RmWs are totally ordered for a given location.
> -	 */
> -	old_active_asid = atomic64_read(&active_asid(info, cpu));
> -	if (old_active_asid &&
> -	    !((asid ^ atomic64_read(&info->generation)) >> info->bits) &&
> -	    atomic64_cmpxchg_relaxed(&active_asid(info, cpu),
> -				     old_active_asid, asid))
> -		return;
> -
> -	asid_new_context(info, pasid, cpu);
> -}
> -
> -/*
> - * Generate a new ASID for the context.
> - *
> - * @pasid: Pointer to the current ASID batch allocated. It will be updated
> - * with the new ASID batch.
> - * @cpu: current CPU ID. Must have been acquired through get_cpu()
> - */
> -static void asid_new_context(struct asid_info *info, atomic64_t *pasid,
> -			     unsigned int cpu)
> -{
> -	unsigned long flags;
> -	u64 asid;
> -
> -	raw_spin_lock_irqsave(&info->lock, flags);
> -	/* Check that our ASID belongs to the current generation. */
> -	asid = atomic64_read(pasid);
> -	if ((asid ^ atomic64_read(&info->generation)) >> info->bits) {
> -		asid = new_context(info, pasid);
> -		atomic64_set(pasid, asid);
> -	}
> -
> -	if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending))
> -		info->flush_cpu_ctxt_cb();
> -
> -	atomic64_set(&active_asid(info, cpu), asid);
> -	raw_spin_unlock_irqrestore(&info->lock, flags);
> -}
> -
>   void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>   {
>   	if (system_supports_cnp())
> @@ -305,38 +108,6 @@ static void asid_flush_cpu_ctxt(void)
>   	local_flush_tlb_all();
>   }
>   
> -/*
> - * Initialize the ASID allocator
> - *
> - * @info: Pointer to the asid allocator structure
> - * @bits: Number of ASIDs available
> - * @asid_per_ctxt: Number of ASIDs to allocate per-context. ASIDs are
> - * allocated contiguously for a given context. This value should be a power of
> - * 2.
> - */
> -static int asid_allocator_init(struct asid_info *info,
> -			       u32 bits, unsigned int asid_per_ctxt,
> -			       void (*flush_cpu_ctxt_cb)(void))
> -{
> -	info->bits = bits;
> -	info->ctxt_shift = ilog2(asid_per_ctxt);
> -	info->flush_cpu_ctxt_cb = flush_cpu_ctxt_cb;
> -	/*
> -	 * Expect allocation after rollover to fail if we don't have at least
> -	 * one more ASID than CPUs. ASID #0 is always reserved.
> -	 */
> -	WARN_ON(NUM_CTXT_ASIDS(info) - 1 <= num_possible_cpus());
> -	atomic64_set(&info->generation, ASID_FIRST_VERSION(info));
> -	info->map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
> -			    sizeof(*info->map), GFP_KERNEL);
> -	if (!info->map)
> -		return -ENOMEM;
> -
> -	raw_spin_lock_init(&info->lock);
> -
> -	return 0;
> -}
> -
>   static int asids_init(void)
>   {
>   	u32 bits = get_cpu_asid_bits();
> @@ -344,7 +115,7 @@ static int asids_init(void)
>   	if (!asid_allocator_init(&asid_info, bits, ASID_PER_CONTEXT,
>   				 asid_flush_cpu_ctxt))
>   		panic("Unable to initialize ASID allocator for %lu ASIDs\n",
> -		      1UL << bits);
> +		      NUM_ASIDS(&asid_info));
>   
>   	asid_info.active = &active_asids;
>   	asid_info.reserved = &reserved_asids;
>
Palmer Dabbelt June 5, 2019, 8:41 p.m. UTC | #2
On Wed, 05 Jun 2019 09:56:03 PDT (-0700), julien.grall@arm.com wrote:
> Hi,
>
> I am CCing RISC-V folks to see if there are an interest to share the code.
>
> @RISC-V: I noticed you are discussing about importing a version of ASID
> allocator in RISC-V. At a first look, the code looks quite similar. Would the
> library below helps you?

Thanks!  I didn't look that closely at the original patches because the
argument against them was just "we don't have any way to test this".
Unfortunately, we don't have the constraint that there are more ASIDs than CPUs
in the system.  As a result I don't think we can use this ASID allocation
strategy.

>
> Cheers,
>
> On 21/03/2019 16:36, Julien Grall wrote:
>> We will want to re-use the ASID allocator in a separate context (e.g
>> allocating VMID). So move the code in a new file.
>>
>> The function asid_check_context has been moved in the header as a static
>> inline function because we want to avoid add a branch when checking if the
>> ASID is still valid.
>>
>> Signed-off-by: Julien Grall <julien.grall@arm.com>
>>
>> ---
>>
>> This code will be used in the virt code for allocating VMID. I am not
>> entirely sure where to place it. Lib could potentially be a good place but I
>> am not entirely convinced the algo as it is could be used by other
>> architecture.
>>
>> Looking at x86, it seems that it will not be possible to re-use because
>> the number of PCID (aka ASID) could be smaller than the number of CPUs.
>> See commit message 10af6235e0d327d42e1bad974385197817923dc1 "x86/mm:
>> Implement PCID based optimization: try to preserve old TLB entries using
>> PCI".
>> ---
>>   arch/arm64/include/asm/asid.h |  77 ++++++++++++++
>>   arch/arm64/lib/Makefile       |   2 +
>>   arch/arm64/lib/asid.c         | 185 +++++++++++++++++++++++++++++++++
>>   arch/arm64/mm/context.c       | 235 +-----------------------------------------
>>   4 files changed, 267 insertions(+), 232 deletions(-)
>>   create mode 100644 arch/arm64/include/asm/asid.h
>>   create mode 100644 arch/arm64/lib/asid.c
>>
>> diff --git a/arch/arm64/include/asm/asid.h b/arch/arm64/include/asm/asid.h
>> new file mode 100644
>> index 000000000000..bb62b587f37f
>> --- /dev/null
>> +++ b/arch/arm64/include/asm/asid.h
>> @@ -0,0 +1,77 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#ifndef __ASM_ASM_ASID_H
>> +#define __ASM_ASM_ASID_H
>> +
>> +#include <linux/atomic.h>
>> +#include <linux/compiler.h>
>> +#include <linux/cpumask.h>
>> +#include <linux/percpu.h>
>> +#include <linux/spinlock.h>
>> +
>> +struct asid_info
>> +{
>> +	atomic64_t	generation;
>> +	unsigned long	*map;
>> +	atomic64_t __percpu	*active;
>> +	u64 __percpu		*reserved;
>> +	u32			bits;
>> +	/* Lock protecting the structure */
>> +	raw_spinlock_t		lock;
>> +	/* Which CPU requires context flush on next call */
>> +	cpumask_t		flush_pending;
>> +	/* Number of ASID allocated by context (shift value) */
>> +	unsigned int		ctxt_shift;
>> +	/* Callback to locally flush the context. */
>> +	void			(*flush_cpu_ctxt_cb)(void);
>> +};
>> +
>> +#define NUM_ASIDS(info)			(1UL << ((info)->bits))
>> +#define NUM_CTXT_ASIDS(info)		(NUM_ASIDS(info) >> (info)->ctxt_shift)
>> +
>> +#define active_asid(info, cpu)	*per_cpu_ptr((info)->active, cpu)
>> +
>> +void asid_new_context(struct asid_info *info, atomic64_t *pasid,
>> +		      unsigned int cpu);
>> +
>> +/*
>> + * Check the ASID is still valid for the context. If not generate a new ASID.
>> + *
>> + * @pasid: Pointer to the current ASID batch
>> + * @cpu: current CPU ID. Must have been acquired throught get_cpu()
>> + */
>> +static inline void asid_check_context(struct asid_info *info,
>> +				      atomic64_t *pasid, unsigned int cpu)
>> +{
>> +	u64 asid, old_active_asid;
>> +
>> +	asid = atomic64_read(pasid);
>> +
>> +	/*
>> +	 * The memory ordering here is subtle.
>> +	 * If our active_asid is non-zero and the ASID matches the current
>> +	 * generation, then we update the active_asid entry with a relaxed
>> +	 * cmpxchg. Racing with a concurrent rollover means that either:
>> +	 *
>> +	 * - We get a zero back from the cmpxchg and end up waiting on the
>> +	 *   lock. Taking the lock synchronises with the rollover and so
>> +	 *   we are forced to see the updated generation.
>> +	 *
>> +	 * - We get a valid ASID back from the cmpxchg, which means the
>> +	 *   relaxed xchg in flush_context will treat us as reserved
>> +	 *   because atomic RmWs are totally ordered for a given location.
>> +	 */
>> +	old_active_asid = atomic64_read(&active_asid(info, cpu));
>> +	if (old_active_asid &&
>> +	    !((asid ^ atomic64_read(&info->generation)) >> info->bits) &&
>> +	    atomic64_cmpxchg_relaxed(&active_asid(info, cpu),
>> +				     old_active_asid, asid))
>> +		return;
>> +
>> +	asid_new_context(info, pasid, cpu);
>> +}
>> +
>> +int asid_allocator_init(struct asid_info *info,
>> +			u32 bits, unsigned int asid_per_ctxt,
>> +			void (*flush_cpu_ctxt_cb)(void));
>> +
>> +#endif
>> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
>> index 5540a1638baf..720df5ee2aa2 100644
>> --- a/arch/arm64/lib/Makefile
>> +++ b/arch/arm64/lib/Makefile
>> @@ -5,6 +5,8 @@ lib-y		:= clear_user.o delay.o copy_from_user.o		\
>>   		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
>>   		   strchr.o strrchr.o tishift.o
>>
>> +lib-y		+= asid.o
>> +
>>   ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
>>   obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
>>   CFLAGS_REMOVE_xor-neon.o	+= -mgeneral-regs-only
>> diff --git a/arch/arm64/lib/asid.c b/arch/arm64/lib/asid.c
>> new file mode 100644
>> index 000000000000..72b71bfb32be
>> --- /dev/null
>> +++ b/arch/arm64/lib/asid.c
>> @@ -0,0 +1,185 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Generic ASID allocator.
>> + *
>> + * Based on arch/arm/mm/context.c
>> + *
>> + * Copyright (C) 2002-2003 Deep Blue Solutions Ltd, all rights reserved.
>> + * Copyright (C) 2012 ARM Ltd.
>> + */
>> +
>> +#include <linux/slab.h>
>> +
>> +#include <asm/asid.h>
>> +
>> +#define reserved_asid(info, cpu) *per_cpu_ptr((info)->reserved, cpu)
>> +
>> +#define ASID_MASK(info)			(~GENMASK((info)->bits - 1, 0))
>> +#define ASID_FIRST_VERSION(info)	(1UL << ((info)->bits))
>> +
>> +#define asid2idx(info, asid)		(((asid) & ~ASID_MASK(info)) >> (info)->ctxt_shift)
>> +#define idx2asid(info, idx)		(((idx) << (info)->ctxt_shift) & ~ASID_MASK(info))
>> +
>> +static void flush_context(struct asid_info *info)
>> +{
>> +	int i;
>> +	u64 asid;
>> +
>> +	/* Update the list of reserved ASIDs and the ASID bitmap. */
>> +	bitmap_clear(info->map, 0, NUM_CTXT_ASIDS(info));
>> +
>> +	for_each_possible_cpu(i) {
>> +		asid = atomic64_xchg_relaxed(&active_asid(info, i), 0);
>> +		/*
>> +		 * If this CPU has already been through a
>> +		 * rollover, but hasn't run another task in
>> +		 * the meantime, we must preserve its reserved
>> +		 * ASID, as this is the only trace we have of
>> +		 * the process it is still running.
>> +		 */
>> +		if (asid == 0)
>> +			asid = reserved_asid(info, i);
>> +		__set_bit(asid2idx(info, asid), info->map);
>> +		reserved_asid(info, i) = asid;
>> +	}
>> +
>> +	/*
>> +	 * Queue a TLB invalidation for each CPU to perform on next
>> +	 * context-switch
>> +	 */
>> +	cpumask_setall(&info->flush_pending);
>> +}
>> +
>> +static bool check_update_reserved_asid(struct asid_info *info, u64 asid,
>> +				       u64 newasid)
>> +{
>> +	int cpu;
>> +	bool hit = false;
>> +
>> +	/*
>> +	 * Iterate over the set of reserved ASIDs looking for a match.
>> +	 * If we find one, then we can update our mm to use newasid
>> +	 * (i.e. the same ASID in the current generation) but we can't
>> +	 * exit the loop early, since we need to ensure that all copies
>> +	 * of the old ASID are updated to reflect the mm. Failure to do
>> +	 * so could result in us missing the reserved ASID in a future
>> +	 * generation.
>> +	 */
>> +	for_each_possible_cpu(cpu) {
>> +		if (reserved_asid(info, cpu) == asid) {
>> +			hit = true;
>> +			reserved_asid(info, cpu) = newasid;
>> +		}
>> +	}
>> +
>> +	return hit;
>> +}
>> +
>> +static u64 new_context(struct asid_info *info, atomic64_t *pasid)
>> +{
>> +	static u32 cur_idx = 1;
>> +	u64 asid = atomic64_read(pasid);
>> +	u64 generation = atomic64_read(&info->generation);
>> +
>> +	if (asid != 0) {
>> +		u64 newasid = generation | (asid & ~ASID_MASK(info));
>> +
>> +		/*
>> +		 * If our current ASID was active during a rollover, we
>> +		 * can continue to use it and this was just a false alarm.
>> +		 */
>> +		if (check_update_reserved_asid(info, asid, newasid))
>> +			return newasid;
>> +
>> +		/*
>> +		 * We had a valid ASID in a previous life, so try to re-use
>> +		 * it if possible.
>> +		 */
>> +		if (!__test_and_set_bit(asid2idx(info, asid), info->map))
>> +			return newasid;
>> +	}
>> +
>> +	/*
>> +	 * Allocate a free ASID. If we can't find one, take a note of the
>> +	 * currently active ASIDs and mark the TLBs as requiring flushes.  We
>> +	 * always count from ASID #2 (index 1), as we use ASID #0 when setting
>> +	 * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd
>> +	 * pairs.
>> +	 */
>> +	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), cur_idx);
>> +	if (asid != NUM_CTXT_ASIDS(info))
>> +		goto set_asid;
>> +
>> +	/* We're out of ASIDs, so increment the global generation count */
>> +	generation = atomic64_add_return_relaxed(ASID_FIRST_VERSION(info),
>> +						 &info->generation);
>> +	flush_context(info);
>> +
>> +	/* We have more ASIDs than CPUs, so this will always succeed */
>> +	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), 1);
>> +
>> +set_asid:
>> +	__set_bit(asid, info->map);
>> +	cur_idx = asid;
>> +	return idx2asid(info, asid) | generation;
>> +}
>> +
>> +/*
>> + * Generate a new ASID for the context.
>> + *
>> + * @pasid: Pointer to the current ASID batch allocated. It will be updated
>> + * with the new ASID batch.
>> + * @cpu: current CPU ID. Must have been acquired through get_cpu()
>> + */
>> +void asid_new_context(struct asid_info *info, atomic64_t *pasid,
>> +		      unsigned int cpu)
>> +{
>> +	unsigned long flags;
>> +	u64 asid;
>> +
>> +	raw_spin_lock_irqsave(&info->lock, flags);
>> +	/* Check that our ASID belongs to the current generation. */
>> +	asid = atomic64_read(pasid);
>> +	if ((asid ^ atomic64_read(&info->generation)) >> info->bits) {
>> +		asid = new_context(info, pasid);
>> +		atomic64_set(pasid, asid);
>> +	}
>> +
>> +	if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending))
>> +		info->flush_cpu_ctxt_cb();
>> +
>> +	atomic64_set(&active_asid(info, cpu), asid);
>> +	raw_spin_unlock_irqrestore(&info->lock, flags);
>> +}
>> +
>> +/*
>> + * Initialize the ASID allocator
>> + *
>> + * @info: Pointer to the asid allocator structure
>> + * @bits: Number of ASIDs available
>> + * @asid_per_ctxt: Number of ASIDs to allocate per-context. ASIDs are
>> + * allocated contiguously for a given context. This value should be a power of
>> + * 2.
>> + */
>> +int asid_allocator_init(struct asid_info *info,
>> +			u32 bits, unsigned int asid_per_ctxt,
>> +			void (*flush_cpu_ctxt_cb)(void))
>> +{
>> +	info->bits = bits;
>> +	info->ctxt_shift = ilog2(asid_per_ctxt);
>> +	info->flush_cpu_ctxt_cb = flush_cpu_ctxt_cb;
>> +	/*
>> +	 * Expect allocation after rollover to fail if we don't have at least
>> +	 * one more ASID than CPUs. ASID #0 is always reserved.
>> +	 */
>> +	WARN_ON(NUM_CTXT_ASIDS(info) - 1 <= num_possible_cpus());
>> +	atomic64_set(&info->generation, ASID_FIRST_VERSION(info));
>> +	info->map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
>> +			    sizeof(*info->map), GFP_KERNEL);
>> +	if (!info->map)
>> +		return -ENOMEM;
>> +
>> +	raw_spin_lock_init(&info->lock);
>> +
>> +	return 0;
>> +}
>> diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
>> index 678a57b77c91..95ee7711a2ef 100644
>> --- a/arch/arm64/mm/context.c
>> +++ b/arch/arm64/mm/context.c
>> @@ -22,47 +22,22 @@
>>   #include <linux/slab.h>
>>   #include <linux/mm.h>
>>
>> +#include <asm/asid.h>
>>   #include <asm/cpufeature.h>
>>   #include <asm/mmu_context.h>
>>   #include <asm/smp.h>
>>   #include <asm/tlbflush.h>
>>
>> -struct asid_info
>> -{
>> -	atomic64_t	generation;
>> -	unsigned long	*map;
>> -	atomic64_t __percpu	*active;
>> -	u64 __percpu		*reserved;
>> -	u32			bits;
>> -	raw_spinlock_t		lock;
>> -	/* Which CPU requires context flush on next call */
>> -	cpumask_t		flush_pending;
>> -	/* Number of ASID allocated by context (shift value) */
>> -	unsigned int		ctxt_shift;
>> -	/* Callback to locally flush the context. */
>> -	void			(*flush_cpu_ctxt_cb)(void);
>> -} asid_info;
>> -
>> -#define active_asid(info, cpu)	*per_cpu_ptr((info)->active, cpu)
>> -#define reserved_asid(info, cpu) *per_cpu_ptr((info)->reserved, cpu)
>> -
>>   static DEFINE_PER_CPU(atomic64_t, active_asids);
>>   static DEFINE_PER_CPU(u64, reserved_asids);
>>
>> -#define ASID_MASK(info)			(~GENMASK((info)->bits - 1, 0))
>> -#define NUM_ASIDS(info)			(1UL << ((info)->bits))
>> -
>> -#define ASID_FIRST_VERSION(info)	NUM_ASIDS(info)
>> -
>>   #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
>>   #define ASID_PER_CONTEXT		2
>>   #else
>>   #define ASID_PER_CONTEXT		1
>>   #endif
>>
>> -#define NUM_CTXT_ASIDS(info)		(NUM_ASIDS(info) >> (info)->ctxt_shift)
>> -#define asid2idx(info, asid)		(((asid) & ~ASID_MASK(info)) >> (info)->ctxt_shift)
>> -#define idx2asid(info, idx)		(((idx) << (info)->ctxt_shift) & ~ASID_MASK(info))
>> +struct asid_info asid_info;
>>
>>   /* Get the ASIDBits supported by the current CPU */
>>   static u32 get_cpu_asid_bits(void)
>> @@ -102,178 +77,6 @@ void verify_cpu_asid_bits(void)
>>   	}
>>   }
>>
>> -static void flush_context(struct asid_info *info)
>> -{
>> -	int i;
>> -	u64 asid;
>> -
>> -	/* Update the list of reserved ASIDs and the ASID bitmap. */
>> -	bitmap_clear(info->map, 0, NUM_CTXT_ASIDS(info));
>> -
>> -	for_each_possible_cpu(i) {
>> -		asid = atomic64_xchg_relaxed(&active_asid(info, i), 0);
>> -		/*
>> -		 * If this CPU has already been through a
>> -		 * rollover, but hasn't run another task in
>> -		 * the meantime, we must preserve its reserved
>> -		 * ASID, as this is the only trace we have of
>> -		 * the process it is still running.
>> -		 */
>> -		if (asid == 0)
>> -			asid = reserved_asid(info, i);
>> -		__set_bit(asid2idx(info, asid), info->map);
>> -		reserved_asid(info, i) = asid;
>> -	}
>> -
>> -	/*
>> -	 * Queue a TLB invalidation for each CPU to perform on next
>> -	 * context-switch
>> -	 */
>> -	cpumask_setall(&info->flush_pending);
>> -}
>> -
>> -static bool check_update_reserved_asid(struct asid_info *info, u64 asid,
>> -				       u64 newasid)
>> -{
>> -	int cpu;
>> -	bool hit = false;
>> -
>> -	/*
>> -	 * Iterate over the set of reserved ASIDs looking for a match.
>> -	 * If we find one, then we can update our mm to use newasid
>> -	 * (i.e. the same ASID in the current generation) but we can't
>> -	 * exit the loop early, since we need to ensure that all copies
>> -	 * of the old ASID are updated to reflect the mm. Failure to do
>> -	 * so could result in us missing the reserved ASID in a future
>> -	 * generation.
>> -	 */
>> -	for_each_possible_cpu(cpu) {
>> -		if (reserved_asid(info, cpu) == asid) {
>> -			hit = true;
>> -			reserved_asid(info, cpu) = newasid;
>> -		}
>> -	}
>> -
>> -	return hit;
>> -}
>> -
>> -static u64 new_context(struct asid_info *info, atomic64_t *pasid)
>> -{
>> -	static u32 cur_idx = 1;
>> -	u64 asid = atomic64_read(pasid);
>> -	u64 generation = atomic64_read(&info->generation);
>> -
>> -	if (asid != 0) {
>> -		u64 newasid = generation | (asid & ~ASID_MASK(info));
>> -
>> -		/*
>> -		 * If our current ASID was active during a rollover, we
>> -		 * can continue to use it and this was just a false alarm.
>> -		 */
>> -		if (check_update_reserved_asid(info, asid, newasid))
>> -			return newasid;
>> -
>> -		/*
>> -		 * We had a valid ASID in a previous life, so try to re-use
>> -		 * it if possible.
>> -		 */
>> -		if (!__test_and_set_bit(asid2idx(info, asid), info->map))
>> -			return newasid;
>> -	}
>> -
>> -	/*
>> -	 * Allocate a free ASID. If we can't find one, take a note of the
>> -	 * currently active ASIDs and mark the TLBs as requiring flushes.  We
>> -	 * always count from ASID #2 (index 1), as we use ASID #0 when setting
>> -	 * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd
>> -	 * pairs.
>> -	 */
>> -	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), cur_idx);
>> -	if (asid != NUM_CTXT_ASIDS(info))
>> -		goto set_asid;
>> -
>> -	/* We're out of ASIDs, so increment the global generation count */
>> -	generation = atomic64_add_return_relaxed(ASID_FIRST_VERSION(info),
>> -						 &info->generation);
>> -	flush_context(info);
>> -
>> -	/* We have more ASIDs than CPUs, so this will always succeed */
>> -	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), 1);
>> -
>> -set_asid:
>> -	__set_bit(asid, info->map);
>> -	cur_idx = asid;
>> -	return idx2asid(info, asid) | generation;
>> -}
>> -
>> -static void asid_new_context(struct asid_info *info, atomic64_t *pasid,
>> -			     unsigned int cpu);
>> -
>> -/*
>> - * Check the ASID is still valid for the context. If not generate a new ASID.
>> - *
>> - * @pasid: Pointer to the current ASID batch
>> - * @cpu: current CPU ID. Must have been acquired throught get_cpu()
>> - */
>> -static void asid_check_context(struct asid_info *info,
>> -			       atomic64_t *pasid, unsigned int cpu)
>> -{
>> -	u64 asid, old_active_asid;
>> -
>> -	asid = atomic64_read(pasid);
>> -
>> -	/*
>> -	 * The memory ordering here is subtle.
>> -	 * If our active_asid is non-zero and the ASID matches the current
>> -	 * generation, then we update the active_asid entry with a relaxed
>> -	 * cmpxchg. Racing with a concurrent rollover means that either:
>> -	 *
>> -	 * - We get a zero back from the cmpxchg and end up waiting on the
>> -	 *   lock. Taking the lock synchronises with the rollover and so
>> -	 *   we are forced to see the updated generation.
>> -	 *
>> -	 * - We get a valid ASID back from the cmpxchg, which means the
>> -	 *   relaxed xchg in flush_context will treat us as reserved
>> -	 *   because atomic RmWs are totally ordered for a given location.
>> -	 */
>> -	old_active_asid = atomic64_read(&active_asid(info, cpu));
>> -	if (old_active_asid &&
>> -	    !((asid ^ atomic64_read(&info->generation)) >> info->bits) &&
>> -	    atomic64_cmpxchg_relaxed(&active_asid(info, cpu),
>> -				     old_active_asid, asid))
>> -		return;
>> -
>> -	asid_new_context(info, pasid, cpu);
>> -}
>> -
>> -/*
>> - * Generate a new ASID for the context.
>> - *
>> - * @pasid: Pointer to the current ASID batch allocated. It will be updated
>> - * with the new ASID batch.
>> - * @cpu: current CPU ID. Must have been acquired through get_cpu()
>> - */
>> -static void asid_new_context(struct asid_info *info, atomic64_t *pasid,
>> -			     unsigned int cpu)
>> -{
>> -	unsigned long flags;
>> -	u64 asid;
>> -
>> -	raw_spin_lock_irqsave(&info->lock, flags);
>> -	/* Check that our ASID belongs to the current generation. */
>> -	asid = atomic64_read(pasid);
>> -	if ((asid ^ atomic64_read(&info->generation)) >> info->bits) {
>> -		asid = new_context(info, pasid);
>> -		atomic64_set(pasid, asid);
>> -	}
>> -
>> -	if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending))
>> -		info->flush_cpu_ctxt_cb();
>> -
>> -	atomic64_set(&active_asid(info, cpu), asid);
>> -	raw_spin_unlock_irqrestore(&info->lock, flags);
>> -}
>> -
>>   void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
>>   {
>>   	if (system_supports_cnp())
>> @@ -305,38 +108,6 @@ static void asid_flush_cpu_ctxt(void)
>>   	local_flush_tlb_all();
>>   }
>>
>> -/*
>> - * Initialize the ASID allocator
>> - *
>> - * @info: Pointer to the asid allocator structure
>> - * @bits: Number of ASIDs available
>> - * @asid_per_ctxt: Number of ASIDs to allocate per-context. ASIDs are
>> - * allocated contiguously for a given context. This value should be a power of
>> - * 2.
>> - */
>> -static int asid_allocator_init(struct asid_info *info,
>> -			       u32 bits, unsigned int asid_per_ctxt,
>> -			       void (*flush_cpu_ctxt_cb)(void))
>> -{
>> -	info->bits = bits;
>> -	info->ctxt_shift = ilog2(asid_per_ctxt);
>> -	info->flush_cpu_ctxt_cb = flush_cpu_ctxt_cb;
>> -	/*
>> -	 * Expect allocation after rollover to fail if we don't have at least
>> -	 * one more ASID than CPUs. ASID #0 is always reserved.
>> -	 */
>> -	WARN_ON(NUM_CTXT_ASIDS(info) - 1 <= num_possible_cpus());
>> -	atomic64_set(&info->generation, ASID_FIRST_VERSION(info));
>> -	info->map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
>> -			    sizeof(*info->map), GFP_KERNEL);
>> -	if (!info->map)
>> -		return -ENOMEM;
>> -
>> -	raw_spin_lock_init(&info->lock);
>> -
>> -	return 0;
>> -}
>> -
>>   static int asids_init(void)
>>   {
>>   	u32 bits = get_cpu_asid_bits();
>> @@ -344,7 +115,7 @@ static int asids_init(void)
>>   	if (!asid_allocator_init(&asid_info, bits, ASID_PER_CONTEXT,
>>   				 asid_flush_cpu_ctxt))
>>   		panic("Unable to initialize ASID allocator for %lu ASIDs\n",
>> -		      1UL << bits);
>> +		      NUM_ASIDS(&asid_info));
>>
>>   	asid_info.active = &active_asids;
>>   	asid_info.reserved = &reserved_asids;
>>
Gary Guo June 11, 2019, 1:56 a.m. UTC | #3
Hi,

On RISC-V, we can only use ASID if there are more ASIDs than CPUs. If there aren't enough ASIDs (or if there is only 1), then ASID feature is disabled and 0 is used everywhere.

Best,
Gary

> -----Original Message-----
> From: Palmer Dabbelt <palmer@sifive.com>
> Sent: Wednesday, June 5, 2019 21:42
> To: julien.grall@arm.com
> Cc: linux-kernel@vger.kernel.org; linux-arm-kernel@lists.infradead.org;
> kvmarm@lists.cs.columbia.edu; aou@eecs.berkeley.edu; Gary Guo
> <gary@garyguo.net>; Atish Patra <Atish.Patra@wdc.com>; Christoph Hellwig
> <hch@infradead.org>; Paul Walmsley <paul.walmsley@sifive.com>;
> rppt@linux.ibm.com; linux-riscv@lists.infradead.org; Anup Patel
> <Anup.Patel@wdc.com>; christoffer.dall@arm.com; james.morse@arm.com;
> marc.zyngier@arm.com; julien.thierry@arm.com; suzuki.poulose@arm.com;
> catalin.marinas@arm.com; Will Deacon <will.deacon@arm.com>
> Subject: Re: [PATCH RFC 11/14] arm64: Move the ASID allocator code in a
> separate file
> 
> On Wed, 05 Jun 2019 09:56:03 PDT (-0700), julien.grall@arm.com wrote:
> > Hi,
> >
> > I am CCing RISC-V folks to see if there are an interest to share the code.
> >
> > @RISC-V: I noticed you are discussing about importing a version of ASID
> > allocator in RISC-V. At a first look, the code looks quite similar. Would the
> > library below helps you?
> 
> Thanks!  I didn't look that closely at the original patches because the
> argument against them was just "we don't have any way to test this".
> Unfortunately, we don't have the constraint that there are more ASIDs than
> CPUs
> in the system.  As a result I don't think we can use this ASID allocation
> strategy.
> 
> >
> > Cheers,
> >
> > On 21/03/2019 16:36, Julien Grall wrote:
> >> We will want to re-use the ASID allocator in a separate context (e.g
> >> allocating VMID). So move the code in a new file.
> >>
> >> The function asid_check_context has been moved in the header as a static
> >> inline function because we want to avoid add a branch when checking if the
> >> ASID is still valid.
> >>
> >> Signed-off-by: Julien Grall <julien.grall@arm.com>
> >>
> >> ---
> >>
> >> This code will be used in the virt code for allocating VMID. I am not
> >> entirely sure where to place it. Lib could potentially be a good place but I
> >> am not entirely convinced the algo as it is could be used by other
> >> architecture.
> >>
> >> Looking at x86, it seems that it will not be possible to re-use because
> >> the number of PCID (aka ASID) could be smaller than the number of CPUs.
> >> See commit message 10af6235e0d327d42e1bad974385197817923dc1
> "x86/mm:
> >> Implement PCID based optimization: try to preserve old TLB entries using
> >> PCI".
> >> ---
> >>   arch/arm64/include/asm/asid.h |  77 ++++++++++++++
> >>   arch/arm64/lib/Makefile       |   2 +
> >>   arch/arm64/lib/asid.c         | 185 +++++++++++++++++++++++++++++++++
> >>   arch/arm64/mm/context.c       | 235 +-----------------------------------------
> >>   4 files changed, 267 insertions(+), 232 deletions(-)
> >>   create mode 100644 arch/arm64/include/asm/asid.h
> >>   create mode 100644 arch/arm64/lib/asid.c
> >>
> >> diff --git a/arch/arm64/include/asm/asid.h b/arch/arm64/include/asm/asid.h
> >> new file mode 100644
> >> index 000000000000..bb62b587f37f
> >> --- /dev/null
> >> +++ b/arch/arm64/include/asm/asid.h
> >> @@ -0,0 +1,77 @@
> >> +/* SPDX-License-Identifier: GPL-2.0 */
> >> +#ifndef __ASM_ASM_ASID_H
> >> +#define __ASM_ASM_ASID_H
> >> +
> >> +#include <linux/atomic.h>
> >> +#include <linux/compiler.h>
> >> +#include <linux/cpumask.h>
> >> +#include <linux/percpu.h>
> >> +#include <linux/spinlock.h>
> >> +
> >> +struct asid_info
> >> +{
> >> +	atomic64_t	generation;
> >> +	unsigned long	*map;
> >> +	atomic64_t __percpu	*active;
> >> +	u64 __percpu		*reserved;
> >> +	u32			bits;
> >> +	/* Lock protecting the structure */
> >> +	raw_spinlock_t		lock;
> >> +	/* Which CPU requires context flush on next call */
> >> +	cpumask_t		flush_pending;
> >> +	/* Number of ASID allocated by context (shift value) */
> >> +	unsigned int		ctxt_shift;
> >> +	/* Callback to locally flush the context. */
> >> +	void			(*flush_cpu_ctxt_cb)(void);
> >> +};
> >> +
> >> +#define NUM_ASIDS(info)			(1UL << ((info)->bits))
> >> +#define NUM_CTXT_ASIDS(info)		(NUM_ASIDS(info) >> (info)-
> >ctxt_shift)
> >> +
> >> +#define active_asid(info, cpu)	*per_cpu_ptr((info)->active, cpu)
> >> +
> >> +void asid_new_context(struct asid_info *info, atomic64_t *pasid,
> >> +		      unsigned int cpu);
> >> +
> >> +/*
> >> + * Check the ASID is still valid for the context. If not generate a new ASID.
> >> + *
> >> + * @pasid: Pointer to the current ASID batch
> >> + * @cpu: current CPU ID. Must have been acquired throught get_cpu()
> >> + */
> >> +static inline void asid_check_context(struct asid_info *info,
> >> +				      atomic64_t *pasid, unsigned int cpu)
> >> +{
> >> +	u64 asid, old_active_asid;
> >> +
> >> +	asid = atomic64_read(pasid);
> >> +
> >> +	/*
> >> +	 * The memory ordering here is subtle.
> >> +	 * If our active_asid is non-zero and the ASID matches the current
> >> +	 * generation, then we update the active_asid entry with a relaxed
> >> +	 * cmpxchg. Racing with a concurrent rollover means that either:
> >> +	 *
> >> +	 * - We get a zero back from the cmpxchg and end up waiting on the
> >> +	 *   lock. Taking the lock synchronises with the rollover and so
> >> +	 *   we are forced to see the updated generation.
> >> +	 *
> >> +	 * - We get a valid ASID back from the cmpxchg, which means the
> >> +	 *   relaxed xchg in flush_context will treat us as reserved
> >> +	 *   because atomic RmWs are totally ordered for a given location.
> >> +	 */
> >> +	old_active_asid = atomic64_read(&active_asid(info, cpu));
> >> +	if (old_active_asid &&
> >> +	    !((asid ^ atomic64_read(&info->generation)) >> info->bits) &&
> >> +	    atomic64_cmpxchg_relaxed(&active_asid(info, cpu),
> >> +				     old_active_asid, asid))
> >> +		return;
> >> +
> >> +	asid_new_context(info, pasid, cpu);
> >> +}
> >> +
> >> +int asid_allocator_init(struct asid_info *info,
> >> +			u32 bits, unsigned int asid_per_ctxt,
> >> +			void (*flush_cpu_ctxt_cb)(void));
> >> +
> >> +#endif
> >> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> >> index 5540a1638baf..720df5ee2aa2 100644
> >> --- a/arch/arm64/lib/Makefile
> >> +++ b/arch/arm64/lib/Makefile
> >> @@ -5,6 +5,8 @@ lib-y		:= clear_user.o delay.o
> copy_from_user.o		\
> >>   		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
> >>   		   strchr.o strrchr.o tishift.o
> >>
> >> +lib-y		+= asid.o
> >> +
> >>   ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
> >>   obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
> >>   CFLAGS_REMOVE_xor-neon.o	+= -mgeneral-regs-only
> >> diff --git a/arch/arm64/lib/asid.c b/arch/arm64/lib/asid.c
> >> new file mode 100644
> >> index 000000000000..72b71bfb32be
> >> --- /dev/null
> >> +++ b/arch/arm64/lib/asid.c
> >> @@ -0,0 +1,185 @@
> >> +// SPDX-License-Identifier: GPL-2.0
> >> +/*
> >> + * Generic ASID allocator.
> >> + *
> >> + * Based on arch/arm/mm/context.c
> >> + *
> >> + * Copyright (C) 2002-2003 Deep Blue Solutions Ltd, all rights reserved.
> >> + * Copyright (C) 2012 ARM Ltd.
> >> + */
> >> +
> >> +#include <linux/slab.h>
> >> +
> >> +#include <asm/asid.h>
> >> +
> >> +#define reserved_asid(info, cpu) *per_cpu_ptr((info)->reserved, cpu)
> >> +
> >> +#define ASID_MASK(info)			(~GENMASK((info)->bits - 1, 0))
> >> +#define ASID_FIRST_VERSION(info)	(1UL << ((info)->bits))
> >> +
> >> +#define asid2idx(info, asid)		(((asid) & ~ASID_MASK(info)) >> (info)-
> >ctxt_shift)
> >> +#define idx2asid(info, idx)		(((idx) << (info)->ctxt_shift) &
> ~ASID_MASK(info))
> >> +
> >> +static void flush_context(struct asid_info *info)
> >> +{
> >> +	int i;
> >> +	u64 asid;
> >> +
> >> +	/* Update the list of reserved ASIDs and the ASID bitmap. */
> >> +	bitmap_clear(info->map, 0, NUM_CTXT_ASIDS(info));
> >> +
> >> +	for_each_possible_cpu(i) {
> >> +		asid = atomic64_xchg_relaxed(&active_asid(info, i), 0);
> >> +		/*
> >> +		 * If this CPU has already been through a
> >> +		 * rollover, but hasn't run another task in
> >> +		 * the meantime, we must preserve its reserved
> >> +		 * ASID, as this is the only trace we have of
> >> +		 * the process it is still running.
> >> +		 */
> >> +		if (asid == 0)
> >> +			asid = reserved_asid(info, i);
> >> +		__set_bit(asid2idx(info, asid), info->map);
> >> +		reserved_asid(info, i) = asid;
> >> +	}
> >> +
> >> +	/*
> >> +	 * Queue a TLB invalidation for each CPU to perform on next
> >> +	 * context-switch
> >> +	 */
> >> +	cpumask_setall(&info->flush_pending);
> >> +}
> >> +
> >> +static bool check_update_reserved_asid(struct asid_info *info, u64 asid,
> >> +				       u64 newasid)
> >> +{
> >> +	int cpu;
> >> +	bool hit = false;
> >> +
> >> +	/*
> >> +	 * Iterate over the set of reserved ASIDs looking for a match.
> >> +	 * If we find one, then we can update our mm to use newasid
> >> +	 * (i.e. the same ASID in the current generation) but we can't
> >> +	 * exit the loop early, since we need to ensure that all copies
> >> +	 * of the old ASID are updated to reflect the mm. Failure to do
> >> +	 * so could result in us missing the reserved ASID in a future
> >> +	 * generation.
> >> +	 */
> >> +	for_each_possible_cpu(cpu) {
> >> +		if (reserved_asid(info, cpu) == asid) {
> >> +			hit = true;
> >> +			reserved_asid(info, cpu) = newasid;
> >> +		}
> >> +	}
> >> +
> >> +	return hit;
> >> +}
> >> +
> >> +static u64 new_context(struct asid_info *info, atomic64_t *pasid)
> >> +{
> >> +	static u32 cur_idx = 1;
> >> +	u64 asid = atomic64_read(pasid);
> >> +	u64 generation = atomic64_read(&info->generation);
> >> +
> >> +	if (asid != 0) {
> >> +		u64 newasid = generation | (asid & ~ASID_MASK(info));
> >> +
> >> +		/*
> >> +		 * If our current ASID was active during a rollover, we
> >> +		 * can continue to use it and this was just a false alarm.
> >> +		 */
> >> +		if (check_update_reserved_asid(info, asid, newasid))
> >> +			return newasid;
> >> +
> >> +		/*
> >> +		 * We had a valid ASID in a previous life, so try to re-use
> >> +		 * it if possible.
> >> +		 */
> >> +		if (!__test_and_set_bit(asid2idx(info, asid), info->map))
> >> +			return newasid;
> >> +	}
> >> +
> >> +	/*
> >> +	 * Allocate a free ASID. If we can't find one, take a note of the
> >> +	 * currently active ASIDs and mark the TLBs as requiring flushes.  We
> >> +	 * always count from ASID #2 (index 1), as we use ASID #0 when setting
> >> +	 * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd
> >> +	 * pairs.
> >> +	 */
> >> +	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), cur_idx);
> >> +	if (asid != NUM_CTXT_ASIDS(info))
> >> +		goto set_asid;
> >> +
> >> +	/* We're out of ASIDs, so increment the global generation count */
> >> +	generation = atomic64_add_return_relaxed(ASID_FIRST_VERSION(info),
> >> +						 &info->generation);
> >> +	flush_context(info);
> >> +
> >> +	/* We have more ASIDs than CPUs, so this will always succeed */
> >> +	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), 1);
> >> +
> >> +set_asid:
> >> +	__set_bit(asid, info->map);
> >> +	cur_idx = asid;
> >> +	return idx2asid(info, asid) | generation;
> >> +}
> >> +
> >> +/*
> >> + * Generate a new ASID for the context.
> >> + *
> >> + * @pasid: Pointer to the current ASID batch allocated. It will be updated
> >> + * with the new ASID batch.
> >> + * @cpu: current CPU ID. Must have been acquired through get_cpu()
> >> + */
> >> +void asid_new_context(struct asid_info *info, atomic64_t *pasid,
> >> +		      unsigned int cpu)
> >> +{
> >> +	unsigned long flags;
> >> +	u64 asid;
> >> +
> >> +	raw_spin_lock_irqsave(&info->lock, flags);
> >> +	/* Check that our ASID belongs to the current generation. */
> >> +	asid = atomic64_read(pasid);
> >> +	if ((asid ^ atomic64_read(&info->generation)) >> info->bits) {
> >> +		asid = new_context(info, pasid);
> >> +		atomic64_set(pasid, asid);
> >> +	}
> >> +
> >> +	if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending))
> >> +		info->flush_cpu_ctxt_cb();
> >> +
> >> +	atomic64_set(&active_asid(info, cpu), asid);
> >> +	raw_spin_unlock_irqrestore(&info->lock, flags);
> >> +}
> >> +
> >> +/*
> >> + * Initialize the ASID allocator
> >> + *
> >> + * @info: Pointer to the asid allocator structure
> >> + * @bits: Number of ASIDs available
> >> + * @asid_per_ctxt: Number of ASIDs to allocate per-context. ASIDs are
> >> + * allocated contiguously for a given context. This value should be a power
> of
> >> + * 2.
> >> + */
> >> +int asid_allocator_init(struct asid_info *info,
> >> +			u32 bits, unsigned int asid_per_ctxt,
> >> +			void (*flush_cpu_ctxt_cb)(void))
> >> +{
> >> +	info->bits = bits;
> >> +	info->ctxt_shift = ilog2(asid_per_ctxt);
> >> +	info->flush_cpu_ctxt_cb = flush_cpu_ctxt_cb;
> >> +	/*
> >> +	 * Expect allocation after rollover to fail if we don't have at least
> >> +	 * one more ASID than CPUs. ASID #0 is always reserved.
> >> +	 */
> >> +	WARN_ON(NUM_CTXT_ASIDS(info) - 1 <= num_possible_cpus());
> >> +	atomic64_set(&info->generation, ASID_FIRST_VERSION(info));
> >> +	info->map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
> >> +			    sizeof(*info->map), GFP_KERNEL);
> >> +	if (!info->map)
> >> +		return -ENOMEM;
> >> +
> >> +	raw_spin_lock_init(&info->lock);
> >> +
> >> +	return 0;
> >> +}
> >> diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
> >> index 678a57b77c91..95ee7711a2ef 100644
> >> --- a/arch/arm64/mm/context.c
> >> +++ b/arch/arm64/mm/context.c
> >> @@ -22,47 +22,22 @@
> >>   #include <linux/slab.h>
> >>   #include <linux/mm.h>
> >>
> >> +#include <asm/asid.h>
> >>   #include <asm/cpufeature.h>
> >>   #include <asm/mmu_context.h>
> >>   #include <asm/smp.h>
> >>   #include <asm/tlbflush.h>
> >>
> >> -struct asid_info
> >> -{
> >> -	atomic64_t	generation;
> >> -	unsigned long	*map;
> >> -	atomic64_t __percpu	*active;
> >> -	u64 __percpu		*reserved;
> >> -	u32			bits;
> >> -	raw_spinlock_t		lock;
> >> -	/* Which CPU requires context flush on next call */
> >> -	cpumask_t		flush_pending;
> >> -	/* Number of ASID allocated by context (shift value) */
> >> -	unsigned int		ctxt_shift;
> >> -	/* Callback to locally flush the context. */
> >> -	void			(*flush_cpu_ctxt_cb)(void);
> >> -} asid_info;
> >> -
> >> -#define active_asid(info, cpu)	*per_cpu_ptr((info)->active, cpu)
> >> -#define reserved_asid(info, cpu) *per_cpu_ptr((info)->reserved, cpu)
> >> -
> >>   static DEFINE_PER_CPU(atomic64_t, active_asids);
> >>   static DEFINE_PER_CPU(u64, reserved_asids);
> >>
> >> -#define ASID_MASK(info)			(~GENMASK((info)->bits - 1, 0))
> >> -#define NUM_ASIDS(info)			(1UL << ((info)->bits))
> >> -
> >> -#define ASID_FIRST_VERSION(info)	NUM_ASIDS(info)
> >> -
> >>   #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
> >>   #define ASID_PER_CONTEXT		2
> >>   #else
> >>   #define ASID_PER_CONTEXT		1
> >>   #endif
> >>
> >> -#define NUM_CTXT_ASIDS(info)		(NUM_ASIDS(info) >> (info)-
> >ctxt_shift)
> >> -#define asid2idx(info, asid)		(((asid) & ~ASID_MASK(info)) >> (info)-
> >ctxt_shift)
> >> -#define idx2asid(info, idx)		(((idx) << (info)->ctxt_shift) &
> ~ASID_MASK(info))
> >> +struct asid_info asid_info;
> >>
> >>   /* Get the ASIDBits supported by the current CPU */
> >>   static u32 get_cpu_asid_bits(void)
> >> @@ -102,178 +77,6 @@ void verify_cpu_asid_bits(void)
> >>   	}
> >>   }
> >>
> >> -static void flush_context(struct asid_info *info)
> >> -{
> >> -	int i;
> >> -	u64 asid;
> >> -
> >> -	/* Update the list of reserved ASIDs and the ASID bitmap. */
> >> -	bitmap_clear(info->map, 0, NUM_CTXT_ASIDS(info));
> >> -
> >> -	for_each_possible_cpu(i) {
> >> -		asid = atomic64_xchg_relaxed(&active_asid(info, i), 0);
> >> -		/*
> >> -		 * If this CPU has already been through a
> >> -		 * rollover, but hasn't run another task in
> >> -		 * the meantime, we must preserve its reserved
> >> -		 * ASID, as this is the only trace we have of
> >> -		 * the process it is still running.
> >> -		 */
> >> -		if (asid == 0)
> >> -			asid = reserved_asid(info, i);
> >> -		__set_bit(asid2idx(info, asid), info->map);
> >> -		reserved_asid(info, i) = asid;
> >> -	}
> >> -
> >> -	/*
> >> -	 * Queue a TLB invalidation for each CPU to perform on next
> >> -	 * context-switch
> >> -	 */
> >> -	cpumask_setall(&info->flush_pending);
> >> -}
> >> -
> >> -static bool check_update_reserved_asid(struct asid_info *info, u64 asid,
> >> -				       u64 newasid)
> >> -{
> >> -	int cpu;
> >> -	bool hit = false;
> >> -
> >> -	/*
> >> -	 * Iterate over the set of reserved ASIDs looking for a match.
> >> -	 * If we find one, then we can update our mm to use newasid
> >> -	 * (i.e. the same ASID in the current generation) but we can't
> >> -	 * exit the loop early, since we need to ensure that all copies
> >> -	 * of the old ASID are updated to reflect the mm. Failure to do
> >> -	 * so could result in us missing the reserved ASID in a future
> >> -	 * generation.
> >> -	 */
> >> -	for_each_possible_cpu(cpu) {
> >> -		if (reserved_asid(info, cpu) == asid) {
> >> -			hit = true;
> >> -			reserved_asid(info, cpu) = newasid;
> >> -		}
> >> -	}
> >> -
> >> -	return hit;
> >> -}
> >> -
> >> -static u64 new_context(struct asid_info *info, atomic64_t *pasid)
> >> -{
> >> -	static u32 cur_idx = 1;
> >> -	u64 asid = atomic64_read(pasid);
> >> -	u64 generation = atomic64_read(&info->generation);
> >> -
> >> -	if (asid != 0) {
> >> -		u64 newasid = generation | (asid & ~ASID_MASK(info));
> >> -
> >> -		/*
> >> -		 * If our current ASID was active during a rollover, we
> >> -		 * can continue to use it and this was just a false alarm.
> >> -		 */
> >> -		if (check_update_reserved_asid(info, asid, newasid))
> >> -			return newasid;
> >> -
> >> -		/*
> >> -		 * We had a valid ASID in a previous life, so try to re-use
> >> -		 * it if possible.
> >> -		 */
> >> -		if (!__test_and_set_bit(asid2idx(info, asid), info->map))
> >> -			return newasid;
> >> -	}
> >> -
> >> -	/*
> >> -	 * Allocate a free ASID. If we can't find one, take a note of the
> >> -	 * currently active ASIDs and mark the TLBs as requiring flushes.  We
> >> -	 * always count from ASID #2 (index 1), as we use ASID #0 when setting
> >> -	 * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd
> >> -	 * pairs.
> >> -	 */
> >> -	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), cur_idx);
> >> -	if (asid != NUM_CTXT_ASIDS(info))
> >> -		goto set_asid;
> >> -
> >> -	/* We're out of ASIDs, so increment the global generation count */
> >> -	generation = atomic64_add_return_relaxed(ASID_FIRST_VERSION(info),
> >> -						 &info->generation);
> >> -	flush_context(info);
> >> -
> >> -	/* We have more ASIDs than CPUs, so this will always succeed */
> >> -	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), 1);
> >> -
> >> -set_asid:
> >> -	__set_bit(asid, info->map);
> >> -	cur_idx = asid;
> >> -	return idx2asid(info, asid) | generation;
> >> -}
> >> -
> >> -static void asid_new_context(struct asid_info *info, atomic64_t *pasid,
> >> -			     unsigned int cpu);
> >> -
> >> -/*
> >> - * Check the ASID is still valid for the context. If not generate a new ASID.
> >> - *
> >> - * @pasid: Pointer to the current ASID batch
> >> - * @cpu: current CPU ID. Must have been acquired throught get_cpu()
> >> - */
> >> -static void asid_check_context(struct asid_info *info,
> >> -			       atomic64_t *pasid, unsigned int cpu)
> >> -{
> >> -	u64 asid, old_active_asid;
> >> -
> >> -	asid = atomic64_read(pasid);
> >> -
> >> -	/*
> >> -	 * The memory ordering here is subtle.
> >> -	 * If our active_asid is non-zero and the ASID matches the current
> >> -	 * generation, then we update the active_asid entry with a relaxed
> >> -	 * cmpxchg. Racing with a concurrent rollover means that either:
> >> -	 *
> >> -	 * - We get a zero back from the cmpxchg and end up waiting on the
> >> -	 *   lock. Taking the lock synchronises with the rollover and so
> >> -	 *   we are forced to see the updated generation.
> >> -	 *
> >> -	 * - We get a valid ASID back from the cmpxchg, which means the
> >> -	 *   relaxed xchg in flush_context will treat us as reserved
> >> -	 *   because atomic RmWs are totally ordered for a given location.
> >> -	 */
> >> -	old_active_asid = atomic64_read(&active_asid(info, cpu));
> >> -	if (old_active_asid &&
> >> -	    !((asid ^ atomic64_read(&info->generation)) >> info->bits) &&
> >> -	    atomic64_cmpxchg_relaxed(&active_asid(info, cpu),
> >> -				     old_active_asid, asid))
> >> -		return;
> >> -
> >> -	asid_new_context(info, pasid, cpu);
> >> -}
> >> -
> >> -/*
> >> - * Generate a new ASID for the context.
> >> - *
> >> - * @pasid: Pointer to the current ASID batch allocated. It will be updated
> >> - * with the new ASID batch.
> >> - * @cpu: current CPU ID. Must have been acquired through get_cpu()
> >> - */
> >> -static void asid_new_context(struct asid_info *info, atomic64_t *pasid,
> >> -			     unsigned int cpu)
> >> -{
> >> -	unsigned long flags;
> >> -	u64 asid;
> >> -
> >> -	raw_spin_lock_irqsave(&info->lock, flags);
> >> -	/* Check that our ASID belongs to the current generation. */
> >> -	asid = atomic64_read(pasid);
> >> -	if ((asid ^ atomic64_read(&info->generation)) >> info->bits) {
> >> -		asid = new_context(info, pasid);
> >> -		atomic64_set(pasid, asid);
> >> -	}
> >> -
> >> -	if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending))
> >> -		info->flush_cpu_ctxt_cb();
> >> -
> >> -	atomic64_set(&active_asid(info, cpu), asid);
> >> -	raw_spin_unlock_irqrestore(&info->lock, flags);
> >> -}
> >> -
> >>   void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
> >>   {
> >>   	if (system_supports_cnp())
> >> @@ -305,38 +108,6 @@ static void asid_flush_cpu_ctxt(void)
> >>   	local_flush_tlb_all();
> >>   }
> >>
> >> -/*
> >> - * Initialize the ASID allocator
> >> - *
> >> - * @info: Pointer to the asid allocator structure
> >> - * @bits: Number of ASIDs available
> >> - * @asid_per_ctxt: Number of ASIDs to allocate per-context. ASIDs are
> >> - * allocated contiguously for a given context. This value should be a power
> of
> >> - * 2.
> >> - */
> >> -static int asid_allocator_init(struct asid_info *info,
> >> -			       u32 bits, unsigned int asid_per_ctxt,
> >> -			       void (*flush_cpu_ctxt_cb)(void))
> >> -{
> >> -	info->bits = bits;
> >> -	info->ctxt_shift = ilog2(asid_per_ctxt);
> >> -	info->flush_cpu_ctxt_cb = flush_cpu_ctxt_cb;
> >> -	/*
> >> -	 * Expect allocation after rollover to fail if we don't have at least
> >> -	 * one more ASID than CPUs. ASID #0 is always reserved.
> >> -	 */
> >> -	WARN_ON(NUM_CTXT_ASIDS(info) - 1 <= num_possible_cpus());
> >> -	atomic64_set(&info->generation, ASID_FIRST_VERSION(info));
> >> -	info->map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
> >> -			    sizeof(*info->map), GFP_KERNEL);
> >> -	if (!info->map)
> >> -		return -ENOMEM;
> >> -
> >> -	raw_spin_lock_init(&info->lock);
> >> -
> >> -	return 0;
> >> -}
> >> -
> >>   static int asids_init(void)
> >>   {
> >>   	u32 bits = get_cpu_asid_bits();
> >> @@ -344,7 +115,7 @@ static int asids_init(void)
> >>   	if (!asid_allocator_init(&asid_info, bits, ASID_PER_CONTEXT,
> >>   				 asid_flush_cpu_ctxt))
> >>   		panic("Unable to initialize ASID allocator for %lu ASIDs\n",
> >> -		      1UL << bits);
> >> +		      NUM_ASIDS(&asid_info));
> >>
> >>   	asid_info.active = &active_asids;
> >>   	asid_info.reserved = &reserved_asids;
> >>
Guo Ren June 19, 2019, 8:07 a.m. UTC | #4
Hi Julien,

You forgot CCing C-SKY folks :P

Move arm asid allocator code in a generic one is a agood idea, I've
made a patchset for C-SKY and test is on processing, See:
https://lore.kernel.org/linux-csky/1560930553-26502-1-git-send-email-guoren@kernel.org/

If you plan to seperate it into generic one, I could co-work with you.
Or I'll bring asid code into csky subsystem first and you can cleanup
them later.

Best Regards
 Guo Ren

ML: linux-csky@vger.kernel.org

On Thu, Jun 6, 2019 at 12:56 AM Julien Grall <julien.grall@arm.com> wrote:
>
> Hi,
>
> I am CCing RISC-V folks to see if there are an interest to share the code.
>
> @RISC-V: I noticed you are discussing about importing a version of ASID
> allocator in RISC-V. At a first look, the code looks quite similar. Would the
> library below helps you?
>
> Cheers,
>
> On 21/03/2019 16:36, Julien Grall wrote:
> > We will want to re-use the ASID allocator in a separate context (e.g
> > allocating VMID). So move the code in a new file.
> >
> > The function asid_check_context has been moved in the header as a static
> > inline function because we want to avoid add a branch when checking if the
> > ASID is still valid.
> >
> > Signed-off-by: Julien Grall <julien.grall@arm.com>
> >
> > ---
> >
> > This code will be used in the virt code for allocating VMID. I am not
> > entirely sure where to place it. Lib could potentially be a good place but I
> > am not entirely convinced the algo as it is could be used by other
> > architecture.
> >
> > Looking at x86, it seems that it will not be possible to re-use because
> > the number of PCID (aka ASID) could be smaller than the number of CPUs.
> > See commit message 10af6235e0d327d42e1bad974385197817923dc1 "x86/mm:
> > Implement PCID based optimization: try to preserve old TLB entries using
> > PCI".
> > ---
> >   arch/arm64/include/asm/asid.h |  77 ++++++++++++++
> >   arch/arm64/lib/Makefile       |   2 +
> >   arch/arm64/lib/asid.c         | 185 +++++++++++++++++++++++++++++++++
> >   arch/arm64/mm/context.c       | 235 +-----------------------------------------
> >   4 files changed, 267 insertions(+), 232 deletions(-)
> >   create mode 100644 arch/arm64/include/asm/asid.h
> >   create mode 100644 arch/arm64/lib/asid.c
> >
> > diff --git a/arch/arm64/include/asm/asid.h b/arch/arm64/include/asm/asid.h
> > new file mode 100644
> > index 000000000000..bb62b587f37f
> > --- /dev/null
> > +++ b/arch/arm64/include/asm/asid.h
> > @@ -0,0 +1,77 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef __ASM_ASM_ASID_H
> > +#define __ASM_ASM_ASID_H
> > +
> > +#include <linux/atomic.h>
> > +#include <linux/compiler.h>
> > +#include <linux/cpumask.h>
> > +#include <linux/percpu.h>
> > +#include <linux/spinlock.h>
> > +
> > +struct asid_info
> > +{
> > +     atomic64_t      generation;
> > +     unsigned long   *map;
> > +     atomic64_t __percpu     *active;
> > +     u64 __percpu            *reserved;
> > +     u32                     bits;
> > +     /* Lock protecting the structure */
> > +     raw_spinlock_t          lock;
> > +     /* Which CPU requires context flush on next call */
> > +     cpumask_t               flush_pending;
> > +     /* Number of ASID allocated by context (shift value) */
> > +     unsigned int            ctxt_shift;
> > +     /* Callback to locally flush the context. */
> > +     void                    (*flush_cpu_ctxt_cb)(void);
> > +};
> > +
> > +#define NUM_ASIDS(info)                      (1UL << ((info)->bits))
> > +#define NUM_CTXT_ASIDS(info)         (NUM_ASIDS(info) >> (info)->ctxt_shift)
> > +
> > +#define active_asid(info, cpu)       *per_cpu_ptr((info)->active, cpu)
> > +
> > +void asid_new_context(struct asid_info *info, atomic64_t *pasid,
> > +                   unsigned int cpu);
> > +
> > +/*
> > + * Check the ASID is still valid for the context. If not generate a new ASID.
> > + *
> > + * @pasid: Pointer to the current ASID batch
> > + * @cpu: current CPU ID. Must have been acquired throught get_cpu()
> > + */
> > +static inline void asid_check_context(struct asid_info *info,
> > +                                   atomic64_t *pasid, unsigned int cpu)
> > +{
> > +     u64 asid, old_active_asid;
> > +
> > +     asid = atomic64_read(pasid);
> > +
> > +     /*
> > +      * The memory ordering here is subtle.
> > +      * If our active_asid is non-zero and the ASID matches the current
> > +      * generation, then we update the active_asid entry with a relaxed
> > +      * cmpxchg. Racing with a concurrent rollover means that either:
> > +      *
> > +      * - We get a zero back from the cmpxchg and end up waiting on the
> > +      *   lock. Taking the lock synchronises with the rollover and so
> > +      *   we are forced to see the updated generation.
> > +      *
> > +      * - We get a valid ASID back from the cmpxchg, which means the
> > +      *   relaxed xchg in flush_context will treat us as reserved
> > +      *   because atomic RmWs are totally ordered for a given location.
> > +      */
> > +     old_active_asid = atomic64_read(&active_asid(info, cpu));
> > +     if (old_active_asid &&
> > +         !((asid ^ atomic64_read(&info->generation)) >> info->bits) &&
> > +         atomic64_cmpxchg_relaxed(&active_asid(info, cpu),
> > +                                  old_active_asid, asid))
> > +             return;
> > +
> > +     asid_new_context(info, pasid, cpu);
> > +}
> > +
> > +int asid_allocator_init(struct asid_info *info,
> > +                     u32 bits, unsigned int asid_per_ctxt,
> > +                     void (*flush_cpu_ctxt_cb)(void));
> > +
> > +#endif
> > diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> > index 5540a1638baf..720df5ee2aa2 100644
> > --- a/arch/arm64/lib/Makefile
> > +++ b/arch/arm64/lib/Makefile
> > @@ -5,6 +5,8 @@ lib-y         := clear_user.o delay.o copy_from_user.o                \
> >                  memcmp.o strcmp.o strncmp.o strlen.o strnlen.o       \
> >                  strchr.o strrchr.o tishift.o
> >
> > +lib-y                += asid.o
> > +
> >   ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
> >   obj-$(CONFIG_XOR_BLOCKS)    += xor-neon.o
> >   CFLAGS_REMOVE_xor-neon.o    += -mgeneral-regs-only
> > diff --git a/arch/arm64/lib/asid.c b/arch/arm64/lib/asid.c
> > new file mode 100644
> > index 000000000000..72b71bfb32be
> > --- /dev/null
> > +++ b/arch/arm64/lib/asid.c
> > @@ -0,0 +1,185 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Generic ASID allocator.
> > + *
> > + * Based on arch/arm/mm/context.c
> > + *
> > + * Copyright (C) 2002-2003 Deep Blue Solutions Ltd, all rights reserved.
> > + * Copyright (C) 2012 ARM Ltd.
> > + */
> > +
> > +#include <linux/slab.h>
> > +
> > +#include <asm/asid.h>
> > +
> > +#define reserved_asid(info, cpu) *per_cpu_ptr((info)->reserved, cpu)
> > +
> > +#define ASID_MASK(info)                      (~GENMASK((info)->bits - 1, 0))
> > +#define ASID_FIRST_VERSION(info)     (1UL << ((info)->bits))
> > +
> > +#define asid2idx(info, asid)         (((asid) & ~ASID_MASK(info)) >> (info)->ctxt_shift)
> > +#define idx2asid(info, idx)          (((idx) << (info)->ctxt_shift) & ~ASID_MASK(info))
> > +
> > +static void flush_context(struct asid_info *info)
> > +{
> > +     int i;
> > +     u64 asid;
> > +
> > +     /* Update the list of reserved ASIDs and the ASID bitmap. */
> > +     bitmap_clear(info->map, 0, NUM_CTXT_ASIDS(info));
> > +
> > +     for_each_possible_cpu(i) {
> > +             asid = atomic64_xchg_relaxed(&active_asid(info, i), 0);
> > +             /*
> > +              * If this CPU has already been through a
> > +              * rollover, but hasn't run another task in
> > +              * the meantime, we must preserve its reserved
> > +              * ASID, as this is the only trace we have of
> > +              * the process it is still running.
> > +              */
> > +             if (asid == 0)
> > +                     asid = reserved_asid(info, i);
> > +             __set_bit(asid2idx(info, asid), info->map);
> > +             reserved_asid(info, i) = asid;
> > +     }
> > +
> > +     /*
> > +      * Queue a TLB invalidation for each CPU to perform on next
> > +      * context-switch
> > +      */
> > +     cpumask_setall(&info->flush_pending);
> > +}
> > +
> > +static bool check_update_reserved_asid(struct asid_info *info, u64 asid,
> > +                                    u64 newasid)
> > +{
> > +     int cpu;
> > +     bool hit = false;
> > +
> > +     /*
> > +      * Iterate over the set of reserved ASIDs looking for a match.
> > +      * If we find one, then we can update our mm to use newasid
> > +      * (i.e. the same ASID in the current generation) but we can't
> > +      * exit the loop early, since we need to ensure that all copies
> > +      * of the old ASID are updated to reflect the mm. Failure to do
> > +      * so could result in us missing the reserved ASID in a future
> > +      * generation.
> > +      */
> > +     for_each_possible_cpu(cpu) {
> > +             if (reserved_asid(info, cpu) == asid) {
> > +                     hit = true;
> > +                     reserved_asid(info, cpu) = newasid;
> > +             }
> > +     }
> > +
> > +     return hit;
> > +}
> > +
> > +static u64 new_context(struct asid_info *info, atomic64_t *pasid)
> > +{
> > +     static u32 cur_idx = 1;
> > +     u64 asid = atomic64_read(pasid);
> > +     u64 generation = atomic64_read(&info->generation);
> > +
> > +     if (asid != 0) {
> > +             u64 newasid = generation | (asid & ~ASID_MASK(info));
> > +
> > +             /*
> > +              * If our current ASID was active during a rollover, we
> > +              * can continue to use it and this was just a false alarm.
> > +              */
> > +             if (check_update_reserved_asid(info, asid, newasid))
> > +                     return newasid;
> > +
> > +             /*
> > +              * We had a valid ASID in a previous life, so try to re-use
> > +              * it if possible.
> > +              */
> > +             if (!__test_and_set_bit(asid2idx(info, asid), info->map))
> > +                     return newasid;
> > +     }
> > +
> > +     /*
> > +      * Allocate a free ASID. If we can't find one, take a note of the
> > +      * currently active ASIDs and mark the TLBs as requiring flushes.  We
> > +      * always count from ASID #2 (index 1), as we use ASID #0 when setting
> > +      * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd
> > +      * pairs.
> > +      */
> > +     asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), cur_idx);
> > +     if (asid != NUM_CTXT_ASIDS(info))
> > +             goto set_asid;
> > +
> > +     /* We're out of ASIDs, so increment the global generation count */
> > +     generation = atomic64_add_return_relaxed(ASID_FIRST_VERSION(info),
> > +                                              &info->generation);
> > +     flush_context(info);
> > +
> > +     /* We have more ASIDs than CPUs, so this will always succeed */
> > +     asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), 1);
> > +
> > +set_asid:
> > +     __set_bit(asid, info->map);
> > +     cur_idx = asid;
> > +     return idx2asid(info, asid) | generation;
> > +}
> > +
> > +/*
> > + * Generate a new ASID for the context.
> > + *
> > + * @pasid: Pointer to the current ASID batch allocated. It will be updated
> > + * with the new ASID batch.
> > + * @cpu: current CPU ID. Must have been acquired through get_cpu()
> > + */
> > +void asid_new_context(struct asid_info *info, atomic64_t *pasid,
> > +                   unsigned int cpu)
> > +{
> > +     unsigned long flags;
> > +     u64 asid;
> > +
> > +     raw_spin_lock_irqsave(&info->lock, flags);
> > +     /* Check that our ASID belongs to the current generation. */
> > +     asid = atomic64_read(pasid);
> > +     if ((asid ^ atomic64_read(&info->generation)) >> info->bits) {
> > +             asid = new_context(info, pasid);
> > +             atomic64_set(pasid, asid);
> > +     }
> > +
> > +     if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending))
> > +             info->flush_cpu_ctxt_cb();
> > +
> > +     atomic64_set(&active_asid(info, cpu), asid);
> > +     raw_spin_unlock_irqrestore(&info->lock, flags);
> > +}
> > +
> > +/*
> > + * Initialize the ASID allocator
> > + *
> > + * @info: Pointer to the asid allocator structure
> > + * @bits: Number of ASIDs available
> > + * @asid_per_ctxt: Number of ASIDs to allocate per-context. ASIDs are
> > + * allocated contiguously for a given context. This value should be a power of
> > + * 2.
> > + */
> > +int asid_allocator_init(struct asid_info *info,
> > +                     u32 bits, unsigned int asid_per_ctxt,
> > +                     void (*flush_cpu_ctxt_cb)(void))
> > +{
> > +     info->bits = bits;
> > +     info->ctxt_shift = ilog2(asid_per_ctxt);
> > +     info->flush_cpu_ctxt_cb = flush_cpu_ctxt_cb;
> > +     /*
> > +      * Expect allocation after rollover to fail if we don't have at least
> > +      * one more ASID than CPUs. ASID #0 is always reserved.
> > +      */
> > +     WARN_ON(NUM_CTXT_ASIDS(info) - 1 <= num_possible_cpus());
> > +     atomic64_set(&info->generation, ASID_FIRST_VERSION(info));
> > +     info->map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
> > +                         sizeof(*info->map), GFP_KERNEL);
> > +     if (!info->map)
> > +             return -ENOMEM;
> > +
> > +     raw_spin_lock_init(&info->lock);
> > +
> > +     return 0;
> > +}
> > diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
> > index 678a57b77c91..95ee7711a2ef 100644
> > --- a/arch/arm64/mm/context.c
> > +++ b/arch/arm64/mm/context.c
> > @@ -22,47 +22,22 @@
> >   #include <linux/slab.h>
> >   #include <linux/mm.h>
> >
> > +#include <asm/asid.h>
> >   #include <asm/cpufeature.h>
> >   #include <asm/mmu_context.h>
> >   #include <asm/smp.h>
> >   #include <asm/tlbflush.h>
> >
> > -struct asid_info
> > -{
> > -     atomic64_t      generation;
> > -     unsigned long   *map;
> > -     atomic64_t __percpu     *active;
> > -     u64 __percpu            *reserved;
> > -     u32                     bits;
> > -     raw_spinlock_t          lock;
> > -     /* Which CPU requires context flush on next call */
> > -     cpumask_t               flush_pending;
> > -     /* Number of ASID allocated by context (shift value) */
> > -     unsigned int            ctxt_shift;
> > -     /* Callback to locally flush the context. */
> > -     void                    (*flush_cpu_ctxt_cb)(void);
> > -} asid_info;
> > -
> > -#define active_asid(info, cpu)       *per_cpu_ptr((info)->active, cpu)
> > -#define reserved_asid(info, cpu) *per_cpu_ptr((info)->reserved, cpu)
> > -
> >   static DEFINE_PER_CPU(atomic64_t, active_asids);
> >   static DEFINE_PER_CPU(u64, reserved_asids);
> >
> > -#define ASID_MASK(info)                      (~GENMASK((info)->bits - 1, 0))
> > -#define NUM_ASIDS(info)                      (1UL << ((info)->bits))
> > -
> > -#define ASID_FIRST_VERSION(info)     NUM_ASIDS(info)
> > -
> >   #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
> >   #define ASID_PER_CONTEXT            2
> >   #else
> >   #define ASID_PER_CONTEXT            1
> >   #endif
> >
> > -#define NUM_CTXT_ASIDS(info)         (NUM_ASIDS(info) >> (info)->ctxt_shift)
> > -#define asid2idx(info, asid)         (((asid) & ~ASID_MASK(info)) >> (info)->ctxt_shift)
> > -#define idx2asid(info, idx)          (((idx) << (info)->ctxt_shift) & ~ASID_MASK(info))
> > +struct asid_info asid_info;
> >
> >   /* Get the ASIDBits supported by the current CPU */
> >   static u32 get_cpu_asid_bits(void)
> > @@ -102,178 +77,6 @@ void verify_cpu_asid_bits(void)
> >       }
> >   }
> >
> > -static void flush_context(struct asid_info *info)
> > -{
> > -     int i;
> > -     u64 asid;
> > -
> > -     /* Update the list of reserved ASIDs and the ASID bitmap. */
> > -     bitmap_clear(info->map, 0, NUM_CTXT_ASIDS(info));
> > -
> > -     for_each_possible_cpu(i) {
> > -             asid = atomic64_xchg_relaxed(&active_asid(info, i), 0);
> > -             /*
> > -              * If this CPU has already been through a
> > -              * rollover, but hasn't run another task in
> > -              * the meantime, we must preserve its reserved
> > -              * ASID, as this is the only trace we have of
> > -              * the process it is still running.
> > -              */
> > -             if (asid == 0)
> > -                     asid = reserved_asid(info, i);
> > -             __set_bit(asid2idx(info, asid), info->map);
> > -             reserved_asid(info, i) = asid;
> > -     }
> > -
> > -     /*
> > -      * Queue a TLB invalidation for each CPU to perform on next
> > -      * context-switch
> > -      */
> > -     cpumask_setall(&info->flush_pending);
> > -}
> > -
> > -static bool check_update_reserved_asid(struct asid_info *info, u64 asid,
> > -                                    u64 newasid)
> > -{
> > -     int cpu;
> > -     bool hit = false;
> > -
> > -     /*
> > -      * Iterate over the set of reserved ASIDs looking for a match.
> > -      * If we find one, then we can update our mm to use newasid
> > -      * (i.e. the same ASID in the current generation) but we can't
> > -      * exit the loop early, since we need to ensure that all copies
> > -      * of the old ASID are updated to reflect the mm. Failure to do
> > -      * so could result in us missing the reserved ASID in a future
> > -      * generation.
> > -      */
> > -     for_each_possible_cpu(cpu) {
> > -             if (reserved_asid(info, cpu) == asid) {
> > -                     hit = true;
> > -                     reserved_asid(info, cpu) = newasid;
> > -             }
> > -     }
> > -
> > -     return hit;
> > -}
> > -
> > -static u64 new_context(struct asid_info *info, atomic64_t *pasid)
> > -{
> > -     static u32 cur_idx = 1;
> > -     u64 asid = atomic64_read(pasid);
> > -     u64 generation = atomic64_read(&info->generation);
> > -
> > -     if (asid != 0) {
> > -             u64 newasid = generation | (asid & ~ASID_MASK(info));
> > -
> > -             /*
> > -              * If our current ASID was active during a rollover, we
> > -              * can continue to use it and this was just a false alarm.
> > -              */
> > -             if (check_update_reserved_asid(info, asid, newasid))
> > -                     return newasid;
> > -
> > -             /*
> > -              * We had a valid ASID in a previous life, so try to re-use
> > -              * it if possible.
> > -              */
> > -             if (!__test_and_set_bit(asid2idx(info, asid), info->map))
> > -                     return newasid;
> > -     }
> > -
> > -     /*
> > -      * Allocate a free ASID. If we can't find one, take a note of the
> > -      * currently active ASIDs and mark the TLBs as requiring flushes.  We
> > -      * always count from ASID #2 (index 1), as we use ASID #0 when setting
> > -      * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd
> > -      * pairs.
> > -      */
> > -     asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), cur_idx);
> > -     if (asid != NUM_CTXT_ASIDS(info))
> > -             goto set_asid;
> > -
> > -     /* We're out of ASIDs, so increment the global generation count */
> > -     generation = atomic64_add_return_relaxed(ASID_FIRST_VERSION(info),
> > -                                              &info->generation);
> > -     flush_context(info);
> > -
> > -     /* We have more ASIDs than CPUs, so this will always succeed */
> > -     asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), 1);
> > -
> > -set_asid:
> > -     __set_bit(asid, info->map);
> > -     cur_idx = asid;
> > -     return idx2asid(info, asid) | generation;
> > -}
> > -
> > -static void asid_new_context(struct asid_info *info, atomic64_t *pasid,
> > -                          unsigned int cpu);
> > -
> > -/*
> > - * Check the ASID is still valid for the context. If not generate a new ASID.
> > - *
> > - * @pasid: Pointer to the current ASID batch
> > - * @cpu: current CPU ID. Must have been acquired throught get_cpu()
> > - */
> > -static void asid_check_context(struct asid_info *info,
> > -                            atomic64_t *pasid, unsigned int cpu)
> > -{
> > -     u64 asid, old_active_asid;
> > -
> > -     asid = atomic64_read(pasid);
> > -
> > -     /*
> > -      * The memory ordering here is subtle.
> > -      * If our active_asid is non-zero and the ASID matches the current
> > -      * generation, then we update the active_asid entry with a relaxed
> > -      * cmpxchg. Racing with a concurrent rollover means that either:
> > -      *
> > -      * - We get a zero back from the cmpxchg and end up waiting on the
> > -      *   lock. Taking the lock synchronises with the rollover and so
> > -      *   we are forced to see the updated generation.
> > -      *
> > -      * - We get a valid ASID back from the cmpxchg, which means the
> > -      *   relaxed xchg in flush_context will treat us as reserved
> > -      *   because atomic RmWs are totally ordered for a given location.
> > -      */
> > -     old_active_asid = atomic64_read(&active_asid(info, cpu));
> > -     if (old_active_asid &&
> > -         !((asid ^ atomic64_read(&info->generation)) >> info->bits) &&
> > -         atomic64_cmpxchg_relaxed(&active_asid(info, cpu),
> > -                                  old_active_asid, asid))
> > -             return;
> > -
> > -     asid_new_context(info, pasid, cpu);
> > -}
> > -
> > -/*
> > - * Generate a new ASID for the context.
> > - *
> > - * @pasid: Pointer to the current ASID batch allocated. It will be updated
> > - * with the new ASID batch.
> > - * @cpu: current CPU ID. Must have been acquired through get_cpu()
> > - */
> > -static void asid_new_context(struct asid_info *info, atomic64_t *pasid,
> > -                          unsigned int cpu)
> > -{
> > -     unsigned long flags;
> > -     u64 asid;
> > -
> > -     raw_spin_lock_irqsave(&info->lock, flags);
> > -     /* Check that our ASID belongs to the current generation. */
> > -     asid = atomic64_read(pasid);
> > -     if ((asid ^ atomic64_read(&info->generation)) >> info->bits) {
> > -             asid = new_context(info, pasid);
> > -             atomic64_set(pasid, asid);
> > -     }
> > -
> > -     if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending))
> > -             info->flush_cpu_ctxt_cb();
> > -
> > -     atomic64_set(&active_asid(info, cpu), asid);
> > -     raw_spin_unlock_irqrestore(&info->lock, flags);
> > -}
> > -
> >   void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
> >   {
> >       if (system_supports_cnp())
> > @@ -305,38 +108,6 @@ static void asid_flush_cpu_ctxt(void)
> >       local_flush_tlb_all();
> >   }
> >
> > -/*
> > - * Initialize the ASID allocator
> > - *
> > - * @info: Pointer to the asid allocator structure
> > - * @bits: Number of ASIDs available
> > - * @asid_per_ctxt: Number of ASIDs to allocate per-context. ASIDs are
> > - * allocated contiguously for a given context. This value should be a power of
> > - * 2.
> > - */
> > -static int asid_allocator_init(struct asid_info *info,
> > -                            u32 bits, unsigned int asid_per_ctxt,
> > -                            void (*flush_cpu_ctxt_cb)(void))
> > -{
> > -     info->bits = bits;
> > -     info->ctxt_shift = ilog2(asid_per_ctxt);
> > -     info->flush_cpu_ctxt_cb = flush_cpu_ctxt_cb;
> > -     /*
> > -      * Expect allocation after rollover to fail if we don't have at least
> > -      * one more ASID than CPUs. ASID #0 is always reserved.
> > -      */
> > -     WARN_ON(NUM_CTXT_ASIDS(info) - 1 <= num_possible_cpus());
> > -     atomic64_set(&info->generation, ASID_FIRST_VERSION(info));
> > -     info->map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
> > -                         sizeof(*info->map), GFP_KERNEL);
> > -     if (!info->map)
> > -             return -ENOMEM;
> > -
> > -     raw_spin_lock_init(&info->lock);
> > -
> > -     return 0;
> > -}
> > -
> >   static int asids_init(void)
> >   {
> >       u32 bits = get_cpu_asid_bits();
> > @@ -344,7 +115,7 @@ static int asids_init(void)
> >       if (!asid_allocator_init(&asid_info, bits, ASID_PER_CONTEXT,
> >                                asid_flush_cpu_ctxt))
> >               panic("Unable to initialize ASID allocator for %lu ASIDs\n",
> > -                   1UL << bits);
> > +                   NUM_ASIDS(&asid_info));
> >
> >       asid_info.active = &active_asids;
> >       asid_info.reserved = &reserved_asids;
> >
>
> --
> Julien Grall
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Julien Grall June 19, 2019, 8:54 a.m. UTC | #5
On 6/19/19 9:07 AM, Guo Ren wrote:
> Hi Julien,

Hi,

> 
> You forgot CCing C-SKY folks :P

I wasn't aware you could be interested :).

> 
> Move arm asid allocator code in a generic one is a agood idea, I've
> made a patchset for C-SKY and test is on processing, See:
> https://lore.kernel.org/linux-csky/1560930553-26502-1-git-send-email-guoren@kernel.org/
> 
> If you plan to seperate it into generic one, I could co-work with you.

Was the ASID allocator work out of box on C-Sky? If so, I can easily 
move the code in a generic place (maybe lib/asid.c).

Cheers,
Will Deacon June 19, 2019, 9:12 a.m. UTC | #6
On Wed, Jun 19, 2019 at 09:54:21AM +0100, Julien Grall wrote:
> On 6/19/19 9:07 AM, Guo Ren wrote:
> > You forgot CCing C-SKY folks :P
> 
> I wasn't aware you could be interested :).
> 
> > Move arm asid allocator code in a generic one is a agood idea, I've
> > made a patchset for C-SKY and test is on processing, See:
> > https://lore.kernel.org/linux-csky/1560930553-26502-1-git-send-email-guoren@kernel.org/
> > 
> > If you plan to seperate it into generic one, I could co-work with you.
> 
> Was the ASID allocator work out of box on C-Sky? If so, I can easily move
> the code in a generic place (maybe lib/asid.c).

This is one place where I'd actually prefer not to go down the route of
making the code generic. Context-switching and low-level TLB management
is deeply architecture-specific and I worry that by trying to make this
code common, we run the real risk of introducing subtle bugs on some
architecture every time it is changed. Furthermore, the algorithm we use
on arm64 is designed to scale to large systems using DVM and may well be
too complex and/or sub-optimal for architectures with different system
topologies or TLB invalidation mechanisms.

It's not a lot of code, so I don't see that it's a big deal to keep it
under arch/arm64.

Will
Guo Ren June 19, 2019, 11:51 a.m. UTC | #7
On Wed, Jun 19, 2019 at 4:54 PM Julien Grall <julien.grall@arm.com> wrote:
>
>
>
> On 6/19/19 9:07 AM, Guo Ren wrote:
> > Hi Julien,
>
> Hi,
>
> >
> > You forgot CCing C-SKY folks :P
>
> I wasn't aware you could be interested :).
>
> >
> > Move arm asid allocator code in a generic one is a agood idea, I've
> > made a patchset for C-SKY and test is on processing, See:
> > https://lore.kernel.org/linux-csky/1560930553-26502-1-git-send-email-guoren@kernel.org/
> >
> > If you plan to seperate it into generic one, I could co-work with you.
>
> Was the ASID allocator work out of box on C-Sky?
Almost done, but one question:
arm64 remove the code in switch_mm:
  cpumask_clear_cpu(cpu, mm_cpumask(prev));
  cpumask_set_cpu(cpu, mm_cpumask(next));

Why? Although arm64 cache operations could affect all harts with CTC
method of interconnect, I think we should
keep these code for primitive integrity in linux. Because cpu_bitmap
is in mm_struct instead of mm->context.

In current csky's patches I've also removed the codes the same as
arm64, but I'll add it back at next version.

> If so, I can easily move the code in a generic place (maybe lib/asid.c).
I think it's OK.

Best Regards
 Guo Ren
Guo Ren June 19, 2019, 12:18 p.m. UTC | #8
On Wed, Jun 19, 2019 at 5:12 PM Will Deacon <will.deacon@arm.com> wrote:
>
> On Wed, Jun 19, 2019 at 09:54:21AM +0100, Julien Grall wrote:
> > On 6/19/19 9:07 AM, Guo Ren wrote:
> > > You forgot CCing C-SKY folks :P
> >
> > I wasn't aware you could be interested :).
> >
> > > Move arm asid allocator code in a generic one is a agood idea, I've
> > > made a patchset for C-SKY and test is on processing, See:
> > > https://lore.kernel.org/linux-csky/1560930553-26502-1-git-send-email-guoren@kernel.org/
> > >
> > > If you plan to seperate it into generic one, I could co-work with you.
> >
> > Was the ASID allocator work out of box on C-Sky? If so, I can easily move
> > the code in a generic place (maybe lib/asid.c).
>
> This is one place where I'd actually prefer not to go down the route of
> making the code generic. Context-switching and low-level TLB management
> is deeply architecture-specific and I worry that by trying to make this
> code common, we run the real risk of introducing subtle bugs on some
> architecture every time it is changed.
"Add generic asid code" and "move arm's into generic" are two things.
We could do
first and let architecture's maintainer to choose.

> Furthermore, the algorithm we use
> on arm64 is designed to scale to large systems using DVM and may well be
> too complex and/or sub-optimal for architectures with different system
> topologies or TLB invalidation mechanisms.
It's just a asid algorithm not very complex and there is a callback
for architecture to define their
own local hart tlb flush. Seems it has nothing with DVM or tlb
broadcast mechanism.

>
> It's not a lot of code, so I don't see that it's a big deal to keep it
> under arch/arm64.
Yes, I think that's ok for arm64.

Hi Arnd,
What do you think about adding generic asid code for arch selection?

Best Regards
 Guo Ren
Will Deacon June 19, 2019, 12:39 p.m. UTC | #9
On Wed, Jun 19, 2019 at 08:18:04PM +0800, Guo Ren wrote:
> On Wed, Jun 19, 2019 at 5:12 PM Will Deacon <will.deacon@arm.com> wrote:
> > This is one place where I'd actually prefer not to go down the route of
> > making the code generic. Context-switching and low-level TLB management
> > is deeply architecture-specific and I worry that by trying to make this
> > code common, we run the real risk of introducing subtle bugs on some
> > architecture every time it is changed.
> "Add generic asid code" and "move arm's into generic" are two things.
> We could do
> first and let architecture's maintainer to choose.

If I understand the proposal being discussed, it involves basing that
generic ASID allocation code around the arm64 implementation which I don't
necessarily think is a good starting point.

> > Furthermore, the algorithm we use
> > on arm64 is designed to scale to large systems using DVM and may well be
> > too complex and/or sub-optimal for architectures with different system
> > topologies or TLB invalidation mechanisms.
> It's just a asid algorithm not very complex and there is a callback
> for architecture to define their
> own local hart tlb flush. Seems it has nothing with DVM or tlb
> broadcast mechanism.

I'm pleased that you think the algorithm is not very complex, but I'm also
worried that you might not have fully understood some of its finer details.

The reason I mention DVM and TLB broadcasting is because, depending on
the mechanisms in your architecture relating to those, it may be strictly
required that all concurrently running threads of a process have the same
ASID at any given point in time, or it may be that you really don't care.

If you don't care, then the arm64 allocator is over-engineered and likely
inefficient for your system. If you do care, then it's worth considering
whether a lock is sufficient around the allocator if you don't expect high
core counts. Another possibility is that you end up using only one ASID and
invalidating the local TLB on every context switch. Yet another design
would be to manage per-cpu ASID pools.

So rather than blindly copying the arm64 code, I suggest sitting down and
designing something that fits to your architecture instead. You may end up
with something that is both simpler and more efficient.

Will
Julien Grall June 19, 2019, 12:52 p.m. UTC | #10
Hi Guo,

On 19/06/2019 12:51, Guo Ren wrote:
> On Wed, Jun 19, 2019 at 4:54 PM Julien Grall <julien.grall@arm.com> wrote:
>>
>>
>>
>> On 6/19/19 9:07 AM, Guo Ren wrote:
>>> Hi Julien,
>>
>> Hi,
>>
>>>
>>> You forgot CCing C-SKY folks :P
>>
>> I wasn't aware you could be interested :).
>>
>>>
>>> Move arm asid allocator code in a generic one is a agood idea, I've
>>> made a patchset for C-SKY and test is on processing, See:
>>> https://lore.kernel.org/linux-csky/1560930553-26502-1-git-send-email-guoren@kernel.org/
>>>
>>> If you plan to seperate it into generic one, I could co-work with you.
>>
>> Was the ASID allocator work out of box on C-Sky?
> Almost done, but one question:
> arm64 remove the code in switch_mm:
>    cpumask_clear_cpu(cpu, mm_cpumask(prev));
>    cpumask_set_cpu(cpu, mm_cpumask(next));


> 
> Why? Although arm64 cache operations could affect all harts with CTC
> method of interconnect, I think we should
> keep these code for primitive integrity in linux. Because cpu_bitmap
> is in mm_struct instead of mm->context.

I will let Will answer to this.

[...]

>> If so, I can easily move the code in a generic place (maybe lib/asid.c).
> I think it's OK.

Will emits concern to move the code in lib. So I will stick with what I 
currently have.

Cheers,
Guo Ren June 20, 2019, 9:33 a.m. UTC | #11
On Wed, Jun 19, 2019 at 8:39 PM Will Deacon <will.deacon@arm.com> wrote:
>
> On Wed, Jun 19, 2019 at 08:18:04PM +0800, Guo Ren wrote:
> > On Wed, Jun 19, 2019 at 5:12 PM Will Deacon <will.deacon@arm.com> wrote:
> > > This is one place where I'd actually prefer not to go down the route of
> > > making the code generic. Context-switching and low-level TLB management
> > > is deeply architecture-specific and I worry that by trying to make this
> > > code common, we run the real risk of introducing subtle bugs on some
> > > architecture every time it is changed.
> > "Add generic asid code" and "move arm's into generic" are two things.
> > We could do
> > first and let architecture's maintainer to choose.
>
> If I understand the proposal being discussed, it involves basing that
> generic ASID allocation code around the arm64 implementation which I don't
> necessarily think is a good starting point.
...
>
> > > Furthermore, the algorithm we use
> > > on arm64 is designed to scale to large systems using DVM and may well be
> > > too complex and/or sub-optimal for architectures with different system
> > > topologies or TLB invalidation mechanisms.
> > It's just a asid algorithm not very complex and there is a callback
> > for architecture to define their
> > own local hart tlb flush. Seems it has nothing with DVM or tlb
> > broadcast mechanism.
>
> I'm pleased that you think the algorithm is not very complex, but I'm also
> worried that you might not have fully understood some of its finer details.
I understand your concern about my less understanding of asid
technology. Here is
my short-description of arm64 asid allocator: (If you find anything
wrong, please
correct me directly, thx :)
The asid allocator use five level check to reduce the cost of switch_mm.
 1. Check if the asid version is the same (it's general)
 2. Check reserved_asid which is set in rollover flush_context() and
the key point is
     keep the same bit position with the current asid version instead
of input version.
 3. Check if the position of bitmap is free then it could be set &
used directly.
 4. find_next_zero_bit() (a little performance cost)
 5. flush_context  (this is the worst cost with increase current asid version)

Check is level by level and cost is also higher with the next level.
The design that
impressed me the most was reserved_asid and bitmap and the 2th level and 3th
level will prevent unnecessary find_next_zero_bit().

The atomic 64 bit asid is also ok for 32-bit system and it won't cost
a lot in 1th 2th 3th
level check.

The operation of set/clear mm_cpumask was removed in arm64 compared to arm32.
It seems no side effect on current arm64 system, but from software
meaning it's wrong.
So I think it should be reserved in generic version.

>
> The reason I mention DVM and TLB broadcasting is because, depending on
> the mechanisms in your architecture relating to those, it may be strictly
> required that all concurrently running threads of a process have the same
> ASID at any given point in time, or it may be that you really don't care.
>
> If you don't care, then the arm64 allocator is over-engineered and likely
> inefficient for your system. If you do care, then it's worth considering
> whether a lock is sufficient around the allocator if you don't expect high
> core counts. Another possibility is that you end up using only one ASID and
> invalidating the local TLB on every context switch. Yet another design
> would be to manage per-cpu ASID pools.
I'll keep my system use the same ASID for SMP + IOMMU :P
Yes, there are two styles of asid allocator: per-cpu ASID (MIPS) or
same ASID (ARM).
If the CPU couldn't support cache/tlb coherency maintian in hardware,
it should use
per-cpu ASID style because IPI is expensive and per-cpu ASID style
need more software
mechanism to improve performance (eg: delay cache flush). From software view the
same ASID is clearer and easier to build bigger system with more TLB caches.

I think the same ASID style is a more sensible choice for modern
processor and let it be
one of generic is reasonable.

>
> So rather than blindly copying the arm64 code, I suggest sitting down and
> designing something that fits to your architecture instead. You may end up
> with something that is both simpler and more efficient.
In fact, riscv folks have discussed a lot about arm's asid allocator
and I learned
a lot from the discussion:
https://lore.kernel.org/linux-riscv/20190327100201.32220-1-anup.patel@wdc.com/

We are developing C-SKY and RISC-V ISA cpu cores and make it generic
is good for us.

Best Regards
 Guo Ren
Catalin Marinas June 21, 2019, 2:16 p.m. UTC | #12
On Wed, Jun 19, 2019 at 07:51:03PM +0800, Guo Ren wrote:
> On Wed, Jun 19, 2019 at 4:54 PM Julien Grall <julien.grall@arm.com> wrote:
> > On 6/19/19 9:07 AM, Guo Ren wrote:
> > > Move arm asid allocator code in a generic one is a agood idea, I've
> > > made a patchset for C-SKY and test is on processing, See:
> > > https://lore.kernel.org/linux-csky/1560930553-26502-1-git-send-email-guoren@kernel.org/
> > >
> > > If you plan to seperate it into generic one, I could co-work with you.
> >
> > Was the ASID allocator work out of box on C-Sky?
> 
> Almost done, but one question:
> arm64 remove the code in switch_mm:
>   cpumask_clear_cpu(cpu, mm_cpumask(prev));
>   cpumask_set_cpu(cpu, mm_cpumask(next));
> 
> Why? Although arm64 cache operations could affect all harts with CTC
> method of interconnect, I think we should keep these code for
> primitive integrity in linux. Because cpu_bitmap is in mm_struct
> instead of mm->context.

We didn't have a use for this in the arm64 code, so no point in
maintaining the mm_cpumask. On some arm32 systems (ARMv6) with no
hardware broadcast of some TLB/cache operations, we use it to track
where the task has run to issue IPI for TLB invalidation or some
deferred I-cache invalidation.

(there was also a potential optimisation on arm64 to avoid broadcast
TLBI if the task only ran on a single CPU but Will found that was rarely
the case on an SMP system because of rebalancing happening during
execve(), ending up with two bits set in the mm_cpumask)

The way you use it on csky is different from how it is done on arm. It
seems to clear the mask for the scheduled out (prev) task but this
wouldn't work on arm(64) since the TLB still contains prev entries
tagged with the scheduled out ASID. Whether it matters, I guess it
depends on the specifics of your hardware.

While the algorithm may seem fairly generic, the semantics have a few
corner cases specific to each architecture. See [1] for a description of
the semantics we need on arm64 (CnP is a feature where the hardware
threads of the same core can share the TLB; the original algorithm
violated the requirements when this feature was enabled).

BTW, if you find the algorithm fairly straightforward ;), see this
bug-fix which took a formal model to identify: a8ffaaa060b8 ("arm64:
asid: Do not replace active_asids if already 0").

[1] https://git.kernel.org/pub/scm/linux/kernel/git/cmarinas/kernel-tla.git/tree/asidalloc.tla#n79
Guo Ren June 23, 2019, 4:35 p.m. UTC | #13
Thx Catalin,

On Fri, Jun 21, 2019 at 10:16 PM Catalin Marinas
<catalin.marinas@arm.com> wrote:
>
> On Wed, Jun 19, 2019 at 07:51:03PM +0800, Guo Ren wrote:
> > On Wed, Jun 19, 2019 at 4:54 PM Julien Grall <julien.grall@arm.com> wrote:
> > > On 6/19/19 9:07 AM, Guo Ren wrote:
> > > > Move arm asid allocator code in a generic one is a agood idea, I've
> > > > made a patchset for C-SKY and test is on processing, See:
> > > > https://lore.kernel.org/linux-csky/1560930553-26502-1-git-send-email-guoren@kernel.org/
> > > >
> > > > If you plan to seperate it into generic one, I could co-work with you.
> > >
> > > Was the ASID allocator work out of box on C-Sky?
> >
> > Almost done, but one question:
> > arm64 remove the code in switch_mm:
> >   cpumask_clear_cpu(cpu, mm_cpumask(prev));
> >   cpumask_set_cpu(cpu, mm_cpumask(next));
> >
> > Why? Although arm64 cache operations could affect all harts with CTC
> > method of interconnect, I think we should keep these code for
> > primitive integrity in linux. Because cpu_bitmap is in mm_struct
> > instead of mm->context.
>
> We didn't have a use for this in the arm64 code, so no point in
> maintaining the mm_cpumask. On some arm32 systems (ARMv6) with no
> hardware broadcast of some TLB/cache operations, we use it to track
> where the task has run to issue IPI for TLB invalidation or some
> deferred I-cache invalidation.
The operation of set/clear mm_cpumask was removed in arm64 compared to
arm32. It seems no side effect on current arm64 system, but from
software meaning it's wrong.
I think we should keep mm_cpumask just like arm32.

>
> (there was also a potential optimisation on arm64 to avoid broadcast
> TLBI if the task only ran on a single CPU but Will found that was rarely
> the case on an SMP system because of rebalancing happening during
> execve(), ending up with two bits set in the mm_cpumask)
>
> The way you use it on csky is different from how it is done on arm. It
> seems to clear the mask for the scheduled out (prev) task but this
> wouldn't work on arm(64) since the TLB still contains prev entries
> tagged with the scheduled out ASID. Whether it matters, I guess it
> depends on the specifics of your hardware.
Sorry for the mistake quote, what I mean is what is done in arm32:
clear all bits of mm->cpu_mask in new_context(), and set back one by
one. Here is my patch:
https://lore.kernel.org/linux-csky/CAJF2gTQ0xQtQY1t-g9FgWaxfDXppMkFooCQzTFy7+ouwUfyA6w@mail.gmail.com/T/#m2ed464d2dfb45ac6f5547fb3936adf2da456cb65

>
> While the algorithm may seem fairly generic, the semantics have a few
> corner cases specific to each architecture. See [1] for a description of
> the semantics we need on arm64 (CnP is a feature where the hardware
> threads of the same core can share the TLB; the original algorithm
> violated the requirements when this feature was enabled).
C-SKY SMP is only one hart per core, but here is a patch [1] with my
thought on SMT duplicate tlb flush:
[1] https://lore.kernel.org/linux-csky/1561305869-18872-1-git-send-email-guoren@kernel.org/T/#u

For TLA+ model, I still need some learning before I could talk with you.

>
> BTW, if you find the algorithm fairly straightforward ;), see this
> bug-fix which took a formal model to identify: a8ffaaa060b8 ("arm64:
> asid: Do not replace active_asids if already 0").
I think it's one fo the cases that other archs also could get benefit
from arm's asid allocator code.
Btw, Is this detected by arm's aisd allocator TLA+ model ? Or a real
bug report ?

--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
Will Deacon June 24, 2019, 10:22 a.m. UTC | #14
On Mon, Jun 24, 2019 at 12:35:35AM +0800, Guo Ren wrote:
> On Fri, Jun 21, 2019 at 10:16 PM Catalin Marinas
> <catalin.marinas@arm.com> wrote:
> >
> > On Wed, Jun 19, 2019 at 07:51:03PM +0800, Guo Ren wrote:
> > > On Wed, Jun 19, 2019 at 4:54 PM Julien Grall <julien.grall@arm.com> wrote:
> > > > On 6/19/19 9:07 AM, Guo Ren wrote:
> > > > > Move arm asid allocator code in a generic one is a agood idea, I've
> > > > > made a patchset for C-SKY and test is on processing, See:
> > > > > https://lore.kernel.org/linux-csky/1560930553-26502-1-git-send-email-guoren@kernel.org/
> > > > >
> > > > > If you plan to seperate it into generic one, I could co-work with you.
> > > >
> > > > Was the ASID allocator work out of box on C-Sky?
> > >
> > > Almost done, but one question:
> > > arm64 remove the code in switch_mm:
> > >   cpumask_clear_cpu(cpu, mm_cpumask(prev));
> > >   cpumask_set_cpu(cpu, mm_cpumask(next));
> > >
> > > Why? Although arm64 cache operations could affect all harts with CTC
> > > method of interconnect, I think we should keep these code for
> > > primitive integrity in linux. Because cpu_bitmap is in mm_struct
> > > instead of mm->context.
> >
> > We didn't have a use for this in the arm64 code, so no point in
> > maintaining the mm_cpumask. On some arm32 systems (ARMv6) with no
> > hardware broadcast of some TLB/cache operations, we use it to track
> > where the task has run to issue IPI for TLB invalidation or some
> > deferred I-cache invalidation.
> The operation of set/clear mm_cpumask was removed in arm64 compared to
> arm32. It seems no side effect on current arm64 system, but from
> software meaning it's wrong.
> I think we should keep mm_cpumask just like arm32.

It was a while ago now, but I remember the atomic update of the mm_cpumask
being quite expensive when I was profiling this stuff, so I removed it
because we don't need it for arm64 (at least, it doesn't allow us to
optimise our shootdowns in practice).

I still think this is over-engineered for what you want on c-sky and making
this code generic is a mistake.

Will
Will Deacon June 24, 2019, 10:40 a.m. UTC | #15
On Thu, Jun 20, 2019 at 05:33:03PM +0800, Guo Ren wrote:
> On Wed, Jun 19, 2019 at 8:39 PM Will Deacon <will.deacon@arm.com> wrote:
> >
> > On Wed, Jun 19, 2019 at 08:18:04PM +0800, Guo Ren wrote:
> > > On Wed, Jun 19, 2019 at 5:12 PM Will Deacon <will.deacon@arm.com> wrote:
> > > > This is one place where I'd actually prefer not to go down the route of
> > > > making the code generic. Context-switching and low-level TLB management
> > > > is deeply architecture-specific and I worry that by trying to make this
> > > > code common, we run the real risk of introducing subtle bugs on some
> > > > architecture every time it is changed.
> > > "Add generic asid code" and "move arm's into generic" are two things.
> > > We could do
> > > first and let architecture's maintainer to choose.
> >
> > If I understand the proposal being discussed, it involves basing that
> > generic ASID allocation code around the arm64 implementation which I don't
> > necessarily think is a good starting point.
> ...
> >
> > > > Furthermore, the algorithm we use
> > > > on arm64 is designed to scale to large systems using DVM and may well be
> > > > too complex and/or sub-optimal for architectures with different system
> > > > topologies or TLB invalidation mechanisms.
> > > It's just a asid algorithm not very complex and there is a callback
> > > for architecture to define their
> > > own local hart tlb flush. Seems it has nothing with DVM or tlb
> > > broadcast mechanism.
> >
> > I'm pleased that you think the algorithm is not very complex, but I'm also
> > worried that you might not have fully understood some of its finer details.
> I understand your concern about my less understanding of asid
> technology. Here is
> my short-description of arm64 asid allocator: (If you find anything
> wrong, please
> correct me directly, thx :)

The complexity mainly comes from the fact that this thing runs concurrently
with itself without synchronization on the fast-path. Coupled with the
need to use the same ASID for all threads of a task, you end up in fiddly
situations where rollover can occur on one CPU whilst another CPU is trying
to schedule a thread of a task that already has threads running in
userspace.

However, it's architecture-specific whether or not you care about that
scenario.

> > The reason I mention DVM and TLB broadcasting is because, depending on
> > the mechanisms in your architecture relating to those, it may be strictly
> > required that all concurrently running threads of a process have the same
> > ASID at any given point in time, or it may be that you really don't care.
> >
> > If you don't care, then the arm64 allocator is over-engineered and likely
> > inefficient for your system. If you do care, then it's worth considering
> > whether a lock is sufficient around the allocator if you don't expect high
> > core counts. Another possibility is that you end up using only one ASID and
> > invalidating the local TLB on every context switch. Yet another design
> > would be to manage per-cpu ASID pools.
> I'll keep my system use the same ASID for SMP + IOMMU :P

You will want a separate allocator for that:

https://lkml.kernel.org/r/20190610184714.6786-2-jean-philippe.brucker@arm.com

> Yes, there are two styles of asid allocator: per-cpu ASID (MIPS) or
> same ASID (ARM).
> If the CPU couldn't support cache/tlb coherency maintian in hardware,
> it should use
> per-cpu ASID style because IPI is expensive and per-cpu ASID style
> need more software
> mechanism to improve performance (eg: delay cache flush). From software view the
> same ASID is clearer and easier to build bigger system with more TLB caches.
> 
> I think the same ASID style is a more sensible choice for modern
> processor and let it be
> one of generic is reasonable.

I'm not sure I agree. x86, for example, is better off using a different
algorithm for allocating its PCIDs.

> > So rather than blindly copying the arm64 code, I suggest sitting down and
> > designing something that fits to your architecture instead. You may end up
> > with something that is both simpler and more efficient.
> In fact, riscv folks have discussed a lot about arm's asid allocator
> and I learned
> a lot from the discussion:
> https://lore.kernel.org/linux-riscv/20190327100201.32220-1-anup.patel@wdc.com/

If you require all threads of the same process to have the same ASID, then
that patch looks broken to me.

Will
Catalin Marinas June 24, 2019, 3:38 p.m. UTC | #16
On Mon, Jun 24, 2019 at 12:35:35AM +0800, Guo Ren wrote:
> On Fri, Jun 21, 2019 at 10:16 PM Catalin Marinas
> <catalin.marinas@arm.com> wrote:
> > BTW, if you find the algorithm fairly straightforward ;), see this
> > bug-fix which took a formal model to identify: a8ffaaa060b8 ("arm64:
> > asid: Do not replace active_asids if already 0").
[...]
> Btw, Is this detected by arm's aisd allocator TLA+ model ? Or a real
> bug report ?

This specific bug was found by the TLA+ model checker (at the time we
were actually tracking down another bug with multi-threaded CPU sharing
the TLB, bug also confirmed by the formal model).
Palmer Dabbelt June 25, 2019, 7:25 a.m. UTC | #17
On Mon, 24 Jun 2019 03:40:07 PDT (-0700), will@kernel.org wrote:
> On Thu, Jun 20, 2019 at 05:33:03PM +0800, Guo Ren wrote:
>> On Wed, Jun 19, 2019 at 8:39 PM Will Deacon <will.deacon@arm.com> wrote:
>> >
>> > On Wed, Jun 19, 2019 at 08:18:04PM +0800, Guo Ren wrote:
>> > > On Wed, Jun 19, 2019 at 5:12 PM Will Deacon <will.deacon@arm.com> wrote:
>> > > > This is one place where I'd actually prefer not to go down the route of
>> > > > making the code generic. Context-switching and low-level TLB management
>> > > > is deeply architecture-specific and I worry that by trying to make this
>> > > > code common, we run the real risk of introducing subtle bugs on some
>> > > > architecture every time it is changed.
>> > > "Add generic asid code" and "move arm's into generic" are two things.
>> > > We could do
>> > > first and let architecture's maintainer to choose.
>> >
>> > If I understand the proposal being discussed, it involves basing that
>> > generic ASID allocation code around the arm64 implementation which I don't
>> > necessarily think is a good starting point.
>> ...
>> >
>> > > > Furthermore, the algorithm we use
>> > > > on arm64 is designed to scale to large systems using DVM and may well be
>> > > > too complex and/or sub-optimal for architectures with different system
>> > > > topologies or TLB invalidation mechanisms.
>> > > It's just a asid algorithm not very complex and there is a callback
>> > > for architecture to define their
>> > > own local hart tlb flush. Seems it has nothing with DVM or tlb
>> > > broadcast mechanism.
>> >
>> > I'm pleased that you think the algorithm is not very complex, but I'm also
>> > worried that you might not have fully understood some of its finer details.
>> I understand your concern about my less understanding of asid
>> technology. Here is
>> my short-description of arm64 asid allocator: (If you find anything
>> wrong, please
>> correct me directly, thx :)
>
> The complexity mainly comes from the fact that this thing runs concurrently
> with itself without synchronization on the fast-path. Coupled with the
> need to use the same ASID for all threads of a task, you end up in fiddly
> situations where rollover can occur on one CPU whilst another CPU is trying
> to schedule a thread of a task that already has threads running in
> userspace.
>
> However, it's architecture-specific whether or not you care about that
> scenario.
>
>> > The reason I mention DVM and TLB broadcasting is because, depending on
>> > the mechanisms in your architecture relating to those, it may be strictly
>> > required that all concurrently running threads of a process have the same
>> > ASID at any given point in time, or it may be that you really don't care.
>> >
>> > If you don't care, then the arm64 allocator is over-engineered and likely
>> > inefficient for your system. If you do care, then it's worth considering
>> > whether a lock is sufficient around the allocator if you don't expect high
>> > core counts. Another possibility is that you end up using only one ASID and
>> > invalidating the local TLB on every context switch. Yet another design
>> > would be to manage per-cpu ASID pools.

FWIW: right now we don't have any implementations that support ASIDs, so we're
really not ready to make these sort of decisions because we just don't know
what systems are going to look like.  While it's a fun intellectual exercise to
try to design an allocator that would work acceptably on systems of various
shapes, there's no way to test this for performance or correctness right now so
I wouldn't be comfortable taking anything.  If you're really interested, the
right place to start is the RTL

    https://github.com/chipsalliance/rocket-chip/blob/master/src/main/scala/rocket/TLB.scala#L19

This is essentially the same problem we have for our spinlocks -- maybe start
with the TLB before doing a whole new pipeline, though :)

>> I'll keep my system use the same ASID for SMP + IOMMU :P
>
> You will want a separate allocator for that:
>
> https://lkml.kernel.org/r/20190610184714.6786-2-jean-philippe.brucker@arm.com
>
>> Yes, there are two styles of asid allocator: per-cpu ASID (MIPS) or
>> same ASID (ARM).
>> If the CPU couldn't support cache/tlb coherency maintian in hardware,
>> it should use
>> per-cpu ASID style because IPI is expensive and per-cpu ASID style
>> need more software
>> mechanism to improve performance (eg: delay cache flush). From software view the
>> same ASID is clearer and easier to build bigger system with more TLB caches.
>>
>> I think the same ASID style is a more sensible choice for modern
>> processor and let it be
>> one of generic is reasonable.
>
> I'm not sure I agree. x86, for example, is better off using a different
> algorithm for allocating its PCIDs.
>
>> > So rather than blindly copying the arm64 code, I suggest sitting down and
>> > designing something that fits to your architecture instead. You may end up
>> > with something that is both simpler and more efficient.
>> In fact, riscv folks have discussed a lot about arm's asid allocator
>> and I learned
>> a lot from the discussion:
>> https://lore.kernel.org/linux-riscv/20190327100201.32220-1-anup.patel@wdc.com/
>
> If you require all threads of the same process to have the same ASID, then
> that patch looks broken to me.
>
> Will
qi.fuli@fujitsu.com June 27, 2019, 9:41 a.m. UTC | #18
On 6/24/19 7:22 PM, Will Deacon wrote:
> On Mon, Jun 24, 2019 at 12:35:35AM +0800, Guo Ren wrote:
>> On Fri, Jun 21, 2019 at 10:16 PM Catalin Marinas
>> <catalin.marinas@arm.com> wrote:
>>> On Wed, Jun 19, 2019 at 07:51:03PM +0800, Guo Ren wrote:
>>>> On Wed, Jun 19, 2019 at 4:54 PM Julien Grall <julien.grall@arm.com> wrote:
>>>>> On 6/19/19 9:07 AM, Guo Ren wrote:
>>>>>> Move arm asid allocator code in a generic one is a agood idea, I've
>>>>>> made a patchset for C-SKY and test is on processing, See:
>>>>>> https://lore.kernel.org/linux-csky/1560930553-26502-1-git-send-email-guoren@kernel.org/
>>>>>>
>>>>>> If you plan to seperate it into generic one, I could co-work with you.
>>>>> Was the ASID allocator work out of box on C-Sky?
>>>> Almost done, but one question:
>>>> arm64 remove the code in switch_mm:
>>>>    cpumask_clear_cpu(cpu, mm_cpumask(prev));
>>>>    cpumask_set_cpu(cpu, mm_cpumask(next));
>>>>
>>>> Why? Although arm64 cache operations could affect all harts with CTC
>>>> method of interconnect, I think we should keep these code for
>>>> primitive integrity in linux. Because cpu_bitmap is in mm_struct
>>>> instead of mm->context.
>>> We didn't have a use for this in the arm64 code, so no point in
>>> maintaining the mm_cpumask. On some arm32 systems (ARMv6) with no
>>> hardware broadcast of some TLB/cache operations, we use it to track
>>> where the task has run to issue IPI for TLB invalidation or some
>>> deferred I-cache invalidation.
>> The operation of set/clear mm_cpumask was removed in arm64 compared to
>> arm32. It seems no side effect on current arm64 system, but from
>> software meaning it's wrong.
>> I think we should keep mm_cpumask just like arm32.
> It was a while ago now, but I remember the atomic update of the mm_cpumask
> being quite expensive when I was profiling this stuff, so I removed it
> because we don't need it for arm64 (at least, it doesn't allow us to
> optimise our shootdowns in practice).

Hi Will,

I think mm_cpumask can be used for filtering the cpus that there are TBL 
entries on.
The OS jitter can be reduced by invalidating TLB entries only on the 
CPUs specified by mm_cpumask(mm).
As I mentioned in an earlier email, the 2.5% OS jitter can result in 
over a factor of 20 slowdown for the same application [1].
Though it may be an extreme example, reducing the OS jitter has been an 
issue in HPC environment.
I would like to avoid broadcast TLBI by using mm_cpumask on arm64, cloud 
you please tell me more about the costs caused by updating mm_cpumask?

Here is my patch:
https://lkml.org/lkml/2019/6/17/703

[1] Ferreira, Kurt B., Patrick Bridges, and Ron Brightwell. 
"Characterizing application sensitivity to OS interference using 
kernel-level noise injection." Proceedings of the 2008 ACM/IEEE 
conference on Supercomputing. IEEE Press, 2008.

Thanks,
QI Fuli

> I still think this is over-engineered for what you want on c-sky and making
> this code generic is a mistake.
>
> Will
Will Deacon June 27, 2019, 10:26 a.m. UTC | #19
On Thu, Jun 27, 2019 at 09:41:42AM +0000, qi.fuli@fujitsu.com wrote:
> 
> On 6/24/19 7:22 PM, Will Deacon wrote:
> > On Mon, Jun 24, 2019 at 12:35:35AM +0800, Guo Ren wrote:
> >> On Fri, Jun 21, 2019 at 10:16 PM Catalin Marinas
> >> <catalin.marinas@arm.com> wrote:
> >>> On Wed, Jun 19, 2019 at 07:51:03PM +0800, Guo Ren wrote:
> >>>> On Wed, Jun 19, 2019 at 4:54 PM Julien Grall <julien.grall@arm.com> wrote:
> >>>>> On 6/19/19 9:07 AM, Guo Ren wrote:
> >>>>>> Move arm asid allocator code in a generic one is a agood idea, I've
> >>>>>> made a patchset for C-SKY and test is on processing, See:
> >>>>>> https://lore.kernel.org/linux-csky/1560930553-26502-1-git-send-email-guoren@kernel.org/
> >>>>>>
> >>>>>> If you plan to seperate it into generic one, I could co-work with you.
> >>>>> Was the ASID allocator work out of box on C-Sky?
> >>>> Almost done, but one question:
> >>>> arm64 remove the code in switch_mm:
> >>>>    cpumask_clear_cpu(cpu, mm_cpumask(prev));
> >>>>    cpumask_set_cpu(cpu, mm_cpumask(next));
> >>>>
> >>>> Why? Although arm64 cache operations could affect all harts with CTC
> >>>> method of interconnect, I think we should keep these code for
> >>>> primitive integrity in linux. Because cpu_bitmap is in mm_struct
> >>>> instead of mm->context.
> >>> We didn't have a use for this in the arm64 code, so no point in
> >>> maintaining the mm_cpumask. On some arm32 systems (ARMv6) with no
> >>> hardware broadcast of some TLB/cache operations, we use it to track
> >>> where the task has run to issue IPI for TLB invalidation or some
> >>> deferred I-cache invalidation.
> >> The operation of set/clear mm_cpumask was removed in arm64 compared to
> >> arm32. It seems no side effect on current arm64 system, but from
> >> software meaning it's wrong.
> >> I think we should keep mm_cpumask just like arm32.
> > It was a while ago now, but I remember the atomic update of the mm_cpumask
> > being quite expensive when I was profiling this stuff, so I removed it
> > because we don't need it for arm64 (at least, it doesn't allow us to
> > optimise our shootdowns in practice).
> 
> I think mm_cpumask can be used for filtering the cpus that there are TBL 
> entries on.

I'm aware that you want to use IPIs for broadcasting TLB invalidation
but that is only tangentially related to this thread, which is about the
current ASID algorithm and the need to update mm_cpumask today.

Please don't conflate the two threads; I already made my position reasonably
clear:

https://lore.kernel.org/linux-arm-kernel/20190617170328.GJ30800@fuggles.cambridge.arm.com/

I will follow-up with another reply there.

Will
Guo Ren June 30, 2019, 4:29 a.m. UTC | #20
Hi Marinas,

Thx for the reply

On Mon, Jun 24, 2019 at 11:38 PM Catalin Marinas
<catalin.marinas@arm.com> wrote:
>
> On Mon, Jun 24, 2019 at 12:35:35AM +0800, Guo Ren wrote:
> > On Fri, Jun 21, 2019 at 10:16 PM Catalin Marinas
> > <catalin.marinas@arm.com> wrote:
> > > BTW, if you find the algorithm fairly straightforward ;), see this
> > > bug-fix which took a formal model to identify: a8ffaaa060b8 ("arm64:
> > > asid: Do not replace active_asids if already 0").
> [...]
> > Btw, Is this detected by arm's aisd allocator TLA+ model ? Or a real
> > bug report ?
>
> This specific bug was found by the TLA+ model checker (at the time we
> were actually tracking down another bug with multi-threaded CPU sharing
> the TLB, bug also confirmed by the formal model).
Could you tell me the ref-link about "another bug with multi-threaded
CPU sharing the TLB" ?

In my concept, the multi-core asid mechanism is also applicable to
multi-thread shared TLB, but it will generate redundant tlbflush. From
the software design logic, multi-threaded is treated as multi-cores
without error, but performance is not optimized. So in my RFC PATCH:
[1], I try to reduce multi-threads' tlbflush in one CPU core with the
fixed cpu ID bitmap hypothesis.

1: https://lore.kernel.org/linux-csky/CAJF2gTQ0xQtQY1t-g9FgWaxfDXppMkFooCQzTFy7+ouwUfyA6w@mail.gmail.com/T/#m2ed464d2dfb45ac6f5547fb3936adf2da456cb65
--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
Catalin Marinas July 1, 2019, 9:17 a.m. UTC | #21
On Sun, Jun 30, 2019 at 12:29:46PM +0800, Guo Ren wrote:
> On Mon, Jun 24, 2019 at 11:38 PM Catalin Marinas
> <catalin.marinas@arm.com> wrote:
> > On Mon, Jun 24, 2019 at 12:35:35AM +0800, Guo Ren wrote:
> > > On Fri, Jun 21, 2019 at 10:16 PM Catalin Marinas
> > > <catalin.marinas@arm.com> wrote:
> > > > BTW, if you find the algorithm fairly straightforward ;), see this
> > > > bug-fix which took a formal model to identify: a8ffaaa060b8 ("arm64:
> > > > asid: Do not replace active_asids if already 0").
> > [...]
> > > Btw, Is this detected by arm's aisd allocator TLA+ model ? Or a real
> > > bug report ?
> >
> > This specific bug was found by the TLA+ model checker (at the time we
> > were actually tracking down another bug with multi-threaded CPU sharing
> > the TLB, bug also confirmed by the formal model).
> 
> Could you tell me the ref-link about "another bug with multi-threaded
> CPU sharing the TLB" ?
> 
> In my concept, the multi-core asid mechanism is also applicable to
> multi-thread shared TLB, but it will generate redundant tlbflush. From
> the software design logic, multi-threaded is treated as multi-cores
> without error, but performance is not optimized.

From the ASID reservation/allocation perspective, the mechanism is the
same between multi-threaded with a shared TLB and multi-core. On arm64,
a local_flush_tlb_all() on a thread invalidates the TLB for the other
threads of the same core.

The actual problem with multi-threaded CPUs is a lot more subtle.
Digging some internal email from 1.5 years ago and pasting it below
(where "current ASID algorithm" refers to the one prior to the fix and
CnP - Common Not Private - means shared TLBs on a multi-threaded CPU):


The current ASID roll-over algorithm allows for a small window where
active_asids for a CPU (P1) is different from the actual ASID in TTBR0.
This can lead to a roll-over on a different CPU (P2) allocating an ASID
(for a different task) which is still hardware-active on P1.

A TLBI on a CPU (or a peer CPU with CnP) does not guarantee that all the
entries corresponding to a valid TTBRx are removed as they can still be
speculatively loaded immediately after TLBI.

While having two different page tables with the same ASID on different
CPUs should be fine without CnP, it becomes problematic when CnP is
enabled:

P1                                      P2
--                                      --
TTBR0.BADDR = T1
TTBR0.ASID = A1
check_and_switch_context(T2,A2)
  asid_maps[P1] = A2
  goto fastpath
                                        check_and_switch_context(T3,A0)
                                          new_context
                                            ASID roll-over allocates A1
                                              since it is not active
                                          TLBI ALL
speculate TTBR0.ASID = A1 entry
                                          TTBR0.BADDR = T3
                                          TTBR0.ASID = A1
  TTBR0.BADDR = T2
  TTBR0.ASID = A2

After this, the common TLB on P1 and P2 (CnP) contains entries
corresponding to the old T1 and A1. Task T3 using the same ASID A1 can
hit such entries. (T1,A1) will eventually be removed from the TLB on the
next context switch on P1 since tlb_flush_pending was set but this is
not guaranteed to happen.


The fix on arm64 (as part of 5ffdfaedfa0a - "arm64: mm: Support Common
Not Private translations") was to set the reserved TTBR0 in
check_and_switch_context(), preventing speculative loads into the TLB
being tagged with the wrong ASID. So this is specific to the ARM CPUs
behaviour w.r.t. speculative TLB loads, it may not be the case (yet) for
your architecture.
Guo Ren July 16, 2019, 3:31 a.m. UTC | #22
Hello Catalin,

Thanks for sharing about CnP assid experience. See my comment below.

On Mon, Jul 1, 2019 at 5:17 PM Catalin Marinas
> From the ASID reservation/allocation perspective, the mechanism is the
> same between multi-threaded with a shared TLB and multi-core. On arm64,
> a local_flush_tlb_all() on a thread invalidates the TLB for the other
> threads of the same core.
>
> The actual problem with multi-threaded CPUs is a lot more subtle.
> Digging some internal email from 1.5 years ago and pasting it below
> (where "current ASID algorithm" refers to the one prior to the fix and
> CnP - Common Not Private - means shared TLBs on a multi-threaded CPU):
>
>
> The current ASID roll-over algorithm allows for a small window where
> active_asids for a CPU (P1) is different from the actual ASID in TTBR0.
> This can lead to a roll-over on a different CPU (P2) allocating an ASID
> (for a different task) which is still hardware-active on P1.
>
> A TLBI on a CPU (or a peer CPU with CnP) does not guarantee that all the
> entries corresponding to a valid TTBRx are removed as they can still be
> speculatively loaded immediately after TLBI.
>
> While having two different page tables with the same ASID on different
> CPUs should be fine without CnP, it becomes problematic when CnP is
> enabled:
>
> P1                                      P2
> --                                      --
> TTBR0.BADDR = T1
> TTBR0.ASID = A1
> check_and_switch_context(T2,A2)
>   asid_maps[P1] = A2
>   goto fastpath
>                                         check_and_switch_context(T3,A0)
>                                           new_context
>                                             ASID roll-over allocates A1
>                                               since it is not active
>                                           TLBI ALL
> speculate TTBR0.ASID = A1 entry
>                                           TTBR0.BADDR = T3
>                                           TTBR0.ASID = A1
>   TTBR0.BADDR = T2
>   TTBR0.ASID = A2
>
> After this, the common TLB on P1 and P2 (CnP) contains entries
> corresponding to the old T1 and A1. Task T3 using the same ASID A1 can
> hit such entries. (T1,A1) will eventually be removed from the TLB on the
> next context switch on P1 since tlb_flush_pending was set but this is
> not guaranteed to happen.
>
>
> The fix on arm64 (as part of 5ffdfaedfa0a - "arm64: mm: Support Common
> Not Private translations") was to set the reserved TTBR0 in
> check_and_switch_context(), preventing speculative loads into the TLB
> being tagged with the wrong ASID. So this is specific to the ARM CPUs
> behaviour w.r.t. speculative TLB loads, it may not be the case (yet) for
> your architecture.

The most important thing is that TLBI ALL occurs between
"asid_maps[P1] = A2" and "TTBR0.BADDR = T2", then speculative
execution after TLBI which access to user space code/data will result
in a valid asid entry which re-filled into the TLB by PTW.

A similar problem should exist if C-SKY ISA supports SMT. Although the
C-SKY kernel prohibits the kernel from speculating on user space code
directly, ld/st can access user space memory in csky kernel mode.
Therefore, a similar problem occurs when it speculatively executes
copy_from / to_user codes in that window.

RISC-V ISA has a SUM setting bit that prevents the kernel from
speculating access to user space. So this problem has been bypassed
from the design.

I saw arm64 to prevent speculation by temporarily setting TTBR0.el1 to
a zero page table. Is that used to prevent speculative execution user
space code or just prevent ld/st in copy_use_* ?
Catalin Marinas July 22, 2019, 4:38 p.m. UTC | #23
On Tue, Jul 16, 2019 at 11:31:27AM +0800, Guo Ren wrote:
> I saw arm64 to prevent speculation by temporarily setting TTBR0.el1 to
> a zero page table. Is that used to prevent speculative execution user
> space code or just prevent ld/st in copy_use_* ?

Only to prevent explicit ld/st from user. On ARMv8.1+, we don't normally
use the TTBR0 trick but rather disable user space access using the PAN
(privileged access never) feature. However, I don't think PAN disables
speculative accesses, only explicit loads/stores. Also, with ARMv8.2
Linux uses the LDTR/STTR instructions in copy_*_user() which don't need
to disable PAN explicitly.
Guo Ren Sept. 7, 2019, 11:52 p.m. UTC | #24
Thx Will,

On Mon, Jun 24, 2019 at 6:40 PM Will Deacon <will@kernel.org> wrote:
> > I'll keep my system use the same ASID for SMP + IOMMU :P
>
> You will want a separate allocator for that:
>
> https://lkml.kernel.org/r/20190610184714.6786-2-jean-philippe.brucker@arm.com

Yes, it is hard to maintain ASID between IOMMU and CPUMMU or different
system, because it's difficult to synchronize the IO_ASID when the CPU
ASID is rollover.
But we could still use hardware broadcast TLB invalidation instruction
to uniformly manage the ASID and IO_ASID, or OTHER_ASID in our IOMMU.

Welcome to join our disscusion:
"Introduce an implementation of IOMMU in linux-riscv"
9 Sep 2019, 10:45 Jade-room-I&II (Corinthia Hotel Lisbon) RISC-V MC

--
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/
Will Deacon Sept. 12, 2019, 2:02 p.m. UTC | #25
On Sun, Sep 08, 2019 at 07:52:55AM +0800, Guo Ren wrote:
> On Mon, Jun 24, 2019 at 6:40 PM Will Deacon <will@kernel.org> wrote:
> > > I'll keep my system use the same ASID for SMP + IOMMU :P
> >
> > You will want a separate allocator for that:
> >
> > https://lkml.kernel.org/r/20190610184714.6786-2-jean-philippe.brucker@arm.com
> 
> Yes, it is hard to maintain ASID between IOMMU and CPUMMU or different
> system, because it's difficult to synchronize the IO_ASID when the CPU
> ASID is rollover.
> But we could still use hardware broadcast TLB invalidation instruction
> to uniformly manage the ASID and IO_ASID, or OTHER_ASID in our IOMMU.

That's probably a bad idea, because you'll likely stall execution on the
CPU until the IOTLB has completed invalidation. In the case of ATS, I think
an endpoint ATC is permitted to take over a minute to respond. In reality, I
suspect the worst you'll ever see would be in the msec range, but that's
still an unacceptable period of time to hold a CPU.

> Welcome to join our disscusion:
> "Introduce an implementation of IOMMU in linux-riscv"
> 9 Sep 2019, 10:45 Jade-room-I&II (Corinthia Hotel Lisbon) RISC-V MC

I attended this session, but it unfortunately raised many more questions
than it answered.

Will
Guo Ren Sept. 12, 2019, 2:59 p.m. UTC | #26
Thx Will for reply.

On Thu, Sep 12, 2019 at 3:03 PM Will Deacon <will@kernel.org> wrote:
>
> On Sun, Sep 08, 2019 at 07:52:55AM +0800, Guo Ren wrote:
> > On Mon, Jun 24, 2019 at 6:40 PM Will Deacon <will@kernel.org> wrote:
> > > > I'll keep my system use the same ASID for SMP + IOMMU :P
> > >
> > > You will want a separate allocator for that:
> > >
> > > https://lkml.kernel.org/r/20190610184714.6786-2-jean-philippe.brucker@arm.com
> >
> > Yes, it is hard to maintain ASID between IOMMU and CPUMMU or different
> > system, because it's difficult to synchronize the IO_ASID when the CPU
> > ASID is rollover.
> > But we could still use hardware broadcast TLB invalidation instruction
> > to uniformly manage the ASID and IO_ASID, or OTHER_ASID in our IOMMU.
>
> That's probably a bad idea, because you'll likely stall execution on the
> CPU until the IOTLB has completed invalidation. In the case of ATS, I think
> an endpoint ATC is permitted to take over a minute to respond. In reality, I
> suspect the worst you'll ever see would be in the msec range, but that's
> still an unacceptable period of time to hold a CPU.
Just as I've said in the session that IOTLB invalidate delay is
another topic, My main proposal is to introduce stage1.pgd and
stage2.pgd as address space identifiers between different TLB systems
based on vmid, asid. My last part of sildes will show you how to
translate stage1/2.pgd to as/vmid in PCI ATS system and the method
could work with SMMU-v3 and intel Vt-d. (It's regret for me there is
no time to show you the whole slides.)

In our light IOMMU implementation, there's no IOTLB invalidate delay
problem. Becasue IOMMU is very close to CPU MMU and interconnect's
delay is the same with SMP CPUs MMU (no PCI, VM supported).

To solve the problem, we could define a async mode in sfence.vma.b to
slove the problem and finished with per_cpu_irq/exception.
Guo Ren Sept. 14, 2019, 8:49 a.m. UTC | #27
Here is the presentation, any comments is welcome.

https://docs.google.com/presentation/d/1sc295JznVAfDIPieAqzjcyUkcHnNFQsK8FFqdoCY854/edit?usp=sharing

On Fri, Sep 13, 2019 at 3:13 PM Guo Ren <guoren@kernel.org> wrote:
>
> Another idea is seperate remote TLB invalidate into two instructions:
>
>  - sfence.vma.b.asyc
>  - sfence.vma.b.barrier // wait all async TLB invalidate operations finished for all harts.
>
> (I remember who mentioned me separate them into two instructions after session. Anup? Is the idea right ?)
>
> Actually, I never consider asyc TLB invalidate before, because current our light iommu did not need it.
>
> Thx all people attend the session :) Let's continue the talk.
>
>
> Guo Ren <guoren@kernel.org> 于 2019年9月12日周四 22:59写道:
>>
>> Thx Will for reply.
>>
>> On Thu, Sep 12, 2019 at 3:03 PM Will Deacon <will@kernel.org> wrote:
>> >
>> > On Sun, Sep 08, 2019 at 07:52:55AM +0800, Guo Ren wrote:
>> > > On Mon, Jun 24, 2019 at 6:40 PM Will Deacon <will@kernel.org> wrote:
>> > > > > I'll keep my system use the same ASID for SMP + IOMMU :P
>> > > >
>> > > > You will want a separate allocator for that:
>> > > >
>> > > > https://lkml.kernel.org/r/20190610184714.6786-2-jean-philippe.brucker@arm.com
>> > >
>> > > Yes, it is hard to maintain ASID between IOMMU and CPUMMU or different
>> > > system, because it's difficult to synchronize the IO_ASID when the CPU
>> > > ASID is rollover.
>> > > But we could still use hardware broadcast TLB invalidation instruction
>> > > to uniformly manage the ASID and IO_ASID, or OTHER_ASID in our IOMMU.
>> >
>> > That's probably a bad idea, because you'll likely stall execution on the
>> > CPU until the IOTLB has completed invalidation. In the case of ATS, I think
>> > an endpoint ATC is permitted to take over a minute to respond. In reality, I
>> > suspect the worst you'll ever see would be in the msec range, but that's
>> > still an unacceptable period of time to hold a CPU.
>> Just as I've said in the session that IOTLB invalidate delay is
>> another topic, My main proposal is to introduce stage1.pgd and
>> stage2.pgd as address space identifiers between different TLB systems
>> based on vmid, asid. My last part of sildes will show you how to
>> translate stage1/2.pgd to as/vmid in PCI ATS system and the method
>> could work with SMMU-v3 and intel Vt-d. (It's regret for me there is
>> no time to show you the whole slides.)
>>
>> In our light IOMMU implementation, there's no IOTLB invalidate delay
>> problem. Becasue IOMMU is very close to CPU MMU and interconnect's
>> delay is the same with SMP CPUs MMU (no PCI, VM supported).
>>
>> To solve the problem, we could define a async mode in sfence.vma.b to
>> slove the problem and finished with per_cpu_irq/exception.
>>
>> --
>> Best Regards
>>  Guo Ren
>>
>> ML: https://lore.kernel.org/linux-csky/
Palmer Dabbelt Sept. 14, 2019, 2:01 p.m. UTC | #28
On Thu, 12 Sep 2019 07:02:56 PDT (-0700), will@kernel.org wrote:
> On Sun, Sep 08, 2019 at 07:52:55AM +0800, Guo Ren wrote:
>> On Mon, Jun 24, 2019 at 6:40 PM Will Deacon <will@kernel.org> wrote:
>> > > I'll keep my system use the same ASID for SMP + IOMMU :P
>> >
>> > You will want a separate allocator for that:
>> >
>> > https://lkml.kernel.org/r/20190610184714.6786-2-jean-philippe.brucker@arm.com
>>
>> Yes, it is hard to maintain ASID between IOMMU and CPUMMU or different
>> system, because it's difficult to synchronize the IO_ASID when the CPU
>> ASID is rollover.
>> But we could still use hardware broadcast TLB invalidation instruction
>> to uniformly manage the ASID and IO_ASID, or OTHER_ASID in our IOMMU.
>
> That's probably a bad idea, because you'll likely stall execution on the
> CPU until the IOTLB has completed invalidation. In the case of ATS, I think
> an endpoint ATC is permitted to take over a minute to respond. In reality, I
> suspect the worst you'll ever see would be in the msec range, but that's
> still an unacceptable period of time to hold a CPU.
>
>> Welcome to join our disscusion:
>> "Introduce an implementation of IOMMU in linux-riscv"
>> 9 Sep 2019, 10:45 Jade-room-I&II (Corinthia Hotel Lisbon) RISC-V MC
>
> I attended this session, but it unfortunately raised many more questions
> than it answered.

Ya, we're a long way from figuring this out.
Anup Patel Sept. 15, 2019, 5:03 a.m. UTC | #29
> -----Original Message-----
> From: linux-kernel-owner@vger.kernel.org <linux-kernel-
> owner@vger.kernel.org> On Behalf Of Palmer Dabbelt
> Sent: Saturday, September 14, 2019 7:31 PM
> To: will@kernel.org
> Cc: guoren@kernel.org; Will Deacon <will.deacon@arm.com>;
> julien.thierry@arm.com; aou@eecs.berkeley.edu; james.morse@arm.com;
> Arnd Bergmann <arnd@arndb.de>; suzuki.poulose@arm.com;
> marc.zyngier@arm.com; catalin.marinas@arm.com; Anup Patel
> <Anup.Patel@wdc.com>; linux-kernel@vger.kernel.org;
> rppt@linux.ibm.com; Christoph Hellwig <hch@infradead.org>; Atish Patra
> <Atish.Patra@wdc.com>; julien.grall@arm.com; gary@garyguo.net; Paul
> Walmsley <paul.walmsley@sifive.com>; christoffer.dall@arm.com; linux-
> riscv@lists.infradead.org; kvmarm@lists.cs.columbia.edu; linux-arm-
> kernel@lists.infradead.org; iommu@lists.linux-foundation.org
> Subject: Re: [PATCH RFC 11/14] arm64: Move the ASID allocator code in a
> separate file
> 
> On Thu, 12 Sep 2019 07:02:56 PDT (-0700), will@kernel.org wrote:
> > On Sun, Sep 08, 2019 at 07:52:55AM +0800, Guo Ren wrote:
> >> On Mon, Jun 24, 2019 at 6:40 PM Will Deacon <will@kernel.org> wrote:
> >> > > I'll keep my system use the same ASID for SMP + IOMMU :P
> >> >
> >> > You will want a separate allocator for that:
> >> >
> >> > https://lkml.kernel.org/r/20190610184714.6786-2-jean-philippe.bruck
> >> > er@arm.com
> >>
> >> Yes, it is hard to maintain ASID between IOMMU and CPUMMU or
> >> different system, because it's difficult to synchronize the IO_ASID
> >> when the CPU ASID is rollover.
> >> But we could still use hardware broadcast TLB invalidation
> >> instruction to uniformly manage the ASID and IO_ASID, or OTHER_ASID in
> our IOMMU.
> >
> > That's probably a bad idea, because you'll likely stall execution on
> > the CPU until the IOTLB has completed invalidation. In the case of
> > ATS, I think an endpoint ATC is permitted to take over a minute to
> > respond. In reality, I suspect the worst you'll ever see would be in
> > the msec range, but that's still an unacceptable period of time to hold a
> CPU.
> >
> >> Welcome to join our disscusion:
> >> "Introduce an implementation of IOMMU in linux-riscv"
> >> 9 Sep 2019, 10:45 Jade-room-I&II (Corinthia Hotel Lisbon) RISC-V MC
> >
> > I attended this session, but it unfortunately raised many more
> > questions than it answered.
> 
> Ya, we're a long way from figuring this out.

For everyone's reference, here is our first attempt at RISC-V ASID allocator:
http://archive.lwn.net:8080/linux-kernel/20190329045111.14040-1-anup.patel@wdc.com/T/#u

Regards,
Anup
Jean-Philippe Brucker Sept. 16, 2019, 12:57 p.m. UTC | #30
Hi,

On 13/09/2019 09:13, Guo Ren wrote:
> Another idea is seperate remote TLB invalidate into two instructions:
> 
>  - sfence.vma.b.asyc
>  - sfence.vma.b.barrier // wait all async TLB invalidate operations
> finished for all harts.

It's not clear to me how this helps, but I probably don't have the whole
picture. If you have a place where it is safe to wait for the barrier to
complete, why not do the whole invalidate there?

> (I remember who mentioned me separate them into two instructions after
> session. Anup? Is the idea right ?) 
> 
> Actually, I never consider asyc TLB invalidate before, because current our
> light iommu did not need it.
> 
> Thx all people attend the session :) Let's continue the talk. 
> 
> 
> Guo Ren <guoren@kernel.org <mailto:guoren@kernel.org>> 于 2019年9月12日周
> 四 22:59写道:
> 
>     Thx Will for reply.
> 
>     On Thu, Sep 12, 2019 at 3:03 PM Will Deacon <will@kernel.org
>     <mailto:will@kernel.org>> wrote:
>     >
>     > On Sun, Sep 08, 2019 at 07:52:55AM +0800, Guo Ren wrote:
>     > > On Mon, Jun 24, 2019 at 6:40 PM Will Deacon <will@kernel.org
>     <mailto:will@kernel.org>> wrote:
>     > > > > I'll keep my system use the same ASID for SMP + IOMMU :P
>     > > >
>     > > > You will want a separate allocator for that:
>     > > >
>     > > >
>     https://lkml.kernel.org/r/20190610184714.6786-2-jean-philippe.brucker@arm.com
>     > >
>     > > Yes, it is hard to maintain ASID between IOMMU and CPUMMU or different
>     > > system, because it's difficult to synchronize the IO_ASID when the CPU
>     > > ASID is rollover.
>     > > But we could still use hardware broadcast TLB invalidation instruction
>     > > to uniformly manage the ASID and IO_ASID, or OTHER_ASID in our IOMMU.
>     >
>     > That's probably a bad idea, because you'll likely stall execution on the
>     > CPU until the IOTLB has completed invalidation. In the case of ATS,
>     I think
>     > an endpoint ATC is permitted to take over a minute to respond. In
>     reality, I
>     > suspect the worst you'll ever see would be in the msec range, but that's
>     > still an unacceptable period of time to hold a CPU.
>     Just as I've said in the session that IOTLB invalidate delay is
>     another topic, My main proposal is to introduce stage1.pgd and
>     stage2.pgd as address space identifiers between different TLB systems
>     based on vmid, asid. My last part of sildes will show you how to
>     translate stage1/2.pgd to as/vmid in PCI ATS system and the method
>     could work with SMMU-v3 and intel Vt-d. (It's regret for me there is
>     no time to show you the whole slides.)
> 
>     In our light IOMMU implementation, there's no IOTLB invalidate delay
>     problem. Becasue IOMMU is very close to CPU MMU and interconnect's
>     delay is the same with SMP CPUs MMU (no PCI, VM supported).
> 
>     To solve the problem, we could define a async mode in sfence.vma.b to
>     slove the problem and finished with per_cpu_irq/exception.

The solution I had to this problem is pinning the ASID [1] used by the
IOMMU, to prevent the CPU from recycling the ASID on rollover. This way
the CPU doesn't have to wait for IOMMU invalidations to complete, when
scheduling a task that might not even have anything to do with the IOMMU.

In the Arm SMMU, ASID and IOASID (PASID) are separate identifiers. IOASID
indexes an entry in the context descriptor table, which contains the ASID.
So with unpinned shared ASID you don't need to invalidate the ATC on
rollover, since the IOASID doesn't change, but you do need to modify the
context descriptor and invalidate cached versions of it.

Once you have pinned ASIDs, you could also declare that IOASID = ASID. I
don't remember finding an argument to strictly forbid it, even though ASID
and IOASID have different sizes on Arm (respectively 8/16 and 20 bits).

Thanks,
Jean

[1]
https://lore.kernel.org/linux-iommu/20180511190641.23008-17-jean-philippe.brucker@arm.com/
Will Deacon Sept. 16, 2019, 6:18 p.m. UTC | #31
On Sun, Sep 15, 2019 at 05:03:38AM +0000, Anup Patel wrote:
> 
> 
> > -----Original Message-----
> > From: linux-kernel-owner@vger.kernel.org <linux-kernel-
> > owner@vger.kernel.org> On Behalf Of Palmer Dabbelt
> > Sent: Saturday, September 14, 2019 7:31 PM
> > To: will@kernel.org
> > Cc: guoren@kernel.org; Will Deacon <will.deacon@arm.com>;
> > julien.thierry@arm.com; aou@eecs.berkeley.edu; james.morse@arm.com;
> > Arnd Bergmann <arnd@arndb.de>; suzuki.poulose@arm.com;
> > marc.zyngier@arm.com; catalin.marinas@arm.com; Anup Patel
> > <Anup.Patel@wdc.com>; linux-kernel@vger.kernel.org;
> > rppt@linux.ibm.com; Christoph Hellwig <hch@infradead.org>; Atish Patra
> > <Atish.Patra@wdc.com>; julien.grall@arm.com; gary@garyguo.net; Paul
> > Walmsley <paul.walmsley@sifive.com>; christoffer.dall@arm.com; linux-
> > riscv@lists.infradead.org; kvmarm@lists.cs.columbia.edu; linux-arm-
> > kernel@lists.infradead.org; iommu@lists.linux-foundation.org
> > Subject: Re: [PATCH RFC 11/14] arm64: Move the ASID allocator code in a
> > separate file
> > 
> > On Thu, 12 Sep 2019 07:02:56 PDT (-0700), will@kernel.org wrote:
> > > On Sun, Sep 08, 2019 at 07:52:55AM +0800, Guo Ren wrote:
> > >> On Mon, Jun 24, 2019 at 6:40 PM Will Deacon <will@kernel.org> wrote:
> > >> > > I'll keep my system use the same ASID for SMP + IOMMU :P
> > >> >
> > >> > You will want a separate allocator for that:
> > >> >
> > >> > https://lkml.kernel.org/r/20190610184714.6786-2-jean-philippe.bruck
> > >> > er@arm.com
> > >>
> > >> Yes, it is hard to maintain ASID between IOMMU and CPUMMU or
> > >> different system, because it's difficult to synchronize the IO_ASID
> > >> when the CPU ASID is rollover.
> > >> But we could still use hardware broadcast TLB invalidation
> > >> instruction to uniformly manage the ASID and IO_ASID, or OTHER_ASID in
> > our IOMMU.
> > >
> > > That's probably a bad idea, because you'll likely stall execution on
> > > the CPU until the IOTLB has completed invalidation. In the case of
> > > ATS, I think an endpoint ATC is permitted to take over a minute to
> > > respond. In reality, I suspect the worst you'll ever see would be in
> > > the msec range, but that's still an unacceptable period of time to hold a
> > CPU.
> > >
> > >> Welcome to join our disscusion:
> > >> "Introduce an implementation of IOMMU in linux-riscv"
> > >> 9 Sep 2019, 10:45 Jade-room-I&II (Corinthia Hotel Lisbon) RISC-V MC
> > >
> > > I attended this session, but it unfortunately raised many more
> > > questions than it answered.
> > 
> > Ya, we're a long way from figuring this out.
> 
> For everyone's reference, here is our first attempt at RISC-V ASID allocator:
> http://archive.lwn.net:8080/linux-kernel/20190329045111.14040-1-anup.patel@wdc.com/T/#u

With a reply stating that the patch "absolutely does not work" ;)

What exactly do you want people to do with that? It's an awful lot of effort
to review this sort of stuff and given that Guo Ren is talking about sharing
page tables between the CPU and an accelerator, maybe you're better off
stabilising Linux for the platforms that you can actually test rather than
getting so far ahead of yourselves that you end up with a bunch of wasted
work on patches that probably won't get merged any time soon.

Seriously, they say "walk before you can run", but this is more "crawl
before you can fly". What's the rush?

Will
Palmer Dabbelt Sept. 16, 2019, 6:28 p.m. UTC | #32
On Mon, 16 Sep 2019 11:18:00 PDT (-0700), will@kernel.org wrote:
> On Sun, Sep 15, 2019 at 05:03:38AM +0000, Anup Patel wrote:
>>
>>
>> > -----Original Message-----
>> > From: linux-kernel-owner@vger.kernel.org <linux-kernel-
>> > owner@vger.kernel.org> On Behalf Of Palmer Dabbelt
>> > Sent: Saturday, September 14, 2019 7:31 PM
>> > To: will@kernel.org
>> > Cc: guoren@kernel.org; Will Deacon <will.deacon@arm.com>;
>> > julien.thierry@arm.com; aou@eecs.berkeley.edu; james.morse@arm.com;
>> > Arnd Bergmann <arnd@arndb.de>; suzuki.poulose@arm.com;
>> > marc.zyngier@arm.com; catalin.marinas@arm.com; Anup Patel
>> > <Anup.Patel@wdc.com>; linux-kernel@vger.kernel.org;
>> > rppt@linux.ibm.com; Christoph Hellwig <hch@infradead.org>; Atish Patra
>> > <Atish.Patra@wdc.com>; julien.grall@arm.com; gary@garyguo.net; Paul
>> > Walmsley <paul.walmsley@sifive.com>; christoffer.dall@arm.com; linux-
>> > riscv@lists.infradead.org; kvmarm@lists.cs.columbia.edu; linux-arm-
>> > kernel@lists.infradead.org; iommu@lists.linux-foundation.org
>> > Subject: Re: [PATCH RFC 11/14] arm64: Move the ASID allocator code in a
>> > separate file
>> >
>> > On Thu, 12 Sep 2019 07:02:56 PDT (-0700), will@kernel.org wrote:
>> > > On Sun, Sep 08, 2019 at 07:52:55AM +0800, Guo Ren wrote:
>> > >> On Mon, Jun 24, 2019 at 6:40 PM Will Deacon <will@kernel.org> wrote:
>> > >> > > I'll keep my system use the same ASID for SMP + IOMMU :P
>> > >> >
>> > >> > You will want a separate allocator for that:
>> > >> >
>> > >> > https://lkml.kernel.org/r/20190610184714.6786-2-jean-philippe.bruck
>> > >> > er@arm.com
>> > >>
>> > >> Yes, it is hard to maintain ASID between IOMMU and CPUMMU or
>> > >> different system, because it's difficult to synchronize the IO_ASID
>> > >> when the CPU ASID is rollover.
>> > >> But we could still use hardware broadcast TLB invalidation
>> > >> instruction to uniformly manage the ASID and IO_ASID, or OTHER_ASID in
>> > our IOMMU.
>> > >
>> > > That's probably a bad idea, because you'll likely stall execution on
>> > > the CPU until the IOTLB has completed invalidation. In the case of
>> > > ATS, I think an endpoint ATC is permitted to take over a minute to
>> > > respond. In reality, I suspect the worst you'll ever see would be in
>> > > the msec range, but that's still an unacceptable period of time to hold a
>> > CPU.
>> > >
>> > >> Welcome to join our disscusion:
>> > >> "Introduce an implementation of IOMMU in linux-riscv"
>> > >> 9 Sep 2019, 10:45 Jade-room-I&II (Corinthia Hotel Lisbon) RISC-V MC
>> > >
>> > > I attended this session, but it unfortunately raised many more
>> > > questions than it answered.
>> >
>> > Ya, we're a long way from figuring this out.
>>
>> For everyone's reference, here is our first attempt at RISC-V ASID allocator:
>> http://archive.lwn.net:8080/linux-kernel/20190329045111.14040-1-anup.patel@wdc.com/T/#u
>
> With a reply stating that the patch "absolutely does not work" ;)
>
> What exactly do you want people to do with that? It's an awful lot of effort
> to review this sort of stuff and given that Guo Ren is talking about sharing
> page tables between the CPU and an accelerator, maybe you're better off
> stabilising Linux for the platforms that you can actually test rather than
> getting so far ahead of yourselves that you end up with a bunch of wasted
> work on patches that probably won't get merged any time soon.
>
> Seriously, they say "walk before you can run", but this is more "crawl
> before you can fly". What's the rush?

I agree, and I think I've been pretty clear here: we're not merging this ASID 
stuff until we have a platform we can test on, particularly as the platforms we 
have now already need some wacky hacks around TLB flushing that we haven't 
gotten to the bottom of.

> Will
Anup Patel Sept. 17, 2019, 3:42 a.m. UTC | #33
> -----Original Message-----
> From: linux-kernel-owner@vger.kernel.org <linux-kernel-
> owner@vger.kernel.org> On Behalf Of Will Deacon
> Sent: Monday, September 16, 2019 11:48 PM
> To: Anup Patel <Anup.Patel@wdc.com>
> Cc: Palmer Dabbelt <palmer@sifive.com>; guoren@kernel.org; Will Deacon
> <will.deacon@arm.com>; julien.thierry@arm.com; aou@eecs.berkeley.edu;
> james.morse@arm.com; Arnd Bergmann <arnd@arndb.de>;
> suzuki.poulose@arm.com; marc.zyngier@arm.com;
> catalin.marinas@arm.com; linux-kernel@vger.kernel.org;
> rppt@linux.ibm.com; Christoph Hellwig <hch@infradead.org>; Atish Patra
> <Atish.Patra@wdc.com>; julien.grall@arm.com; gary@garyguo.net; Paul
> Walmsley <paul.walmsley@sifive.com>; christoffer.dall@arm.com; linux-
> riscv@lists.infradead.org; kvmarm@lists.cs.columbia.edu; linux-arm-
> kernel@lists.infradead.org; iommu@lists.linux-foundation.org
> Subject: Re: [PATCH RFC 11/14] arm64: Move the ASID allocator code in a
> separate file
> 
> On Sun, Sep 15, 2019 at 05:03:38AM +0000, Anup Patel wrote:
> >
> >
> > > -----Original Message-----
> > > From: linux-kernel-owner@vger.kernel.org <linux-kernel-
> > > owner@vger.kernel.org> On Behalf Of Palmer Dabbelt
> > > Sent: Saturday, September 14, 2019 7:31 PM
> > > To: will@kernel.org
> > > Cc: guoren@kernel.org; Will Deacon <will.deacon@arm.com>;
> > > julien.thierry@arm.com; aou@eecs.berkeley.edu;
> james.morse@arm.com;
> > > Arnd Bergmann <arnd@arndb.de>; suzuki.poulose@arm.com;
> > > marc.zyngier@arm.com; catalin.marinas@arm.com; Anup Patel
> > > <Anup.Patel@wdc.com>; linux-kernel@vger.kernel.org;
> > > rppt@linux.ibm.com; Christoph Hellwig <hch@infradead.org>; Atish
> > > Patra <Atish.Patra@wdc.com>; julien.grall@arm.com; gary@garyguo.net;
> > > Paul Walmsley <paul.walmsley@sifive.com>; christoffer.dall@arm.com;
> > > linux- riscv@lists.infradead.org; kvmarm@lists.cs.columbia.edu;
> > > linux-arm- kernel@lists.infradead.org;
> > > iommu@lists.linux-foundation.org
> > > Subject: Re: [PATCH RFC 11/14] arm64: Move the ASID allocator code
> > > in a separate file
> > >
> > > On Thu, 12 Sep 2019 07:02:56 PDT (-0700), will@kernel.org wrote:
> > > > On Sun, Sep 08, 2019 at 07:52:55AM +0800, Guo Ren wrote:
> > > >> On Mon, Jun 24, 2019 at 6:40 PM Will Deacon <will@kernel.org>
> wrote:
> > > >> > > I'll keep my system use the same ASID for SMP + IOMMU :P
> > > >> >
> > > >> > You will want a separate allocator for that:
> > > >> >
> > > >> > https://lkml.kernel.org/r/20190610184714.6786-2-jean-philippe.b
> > > >> > ruck
> > > >> > er@arm.com
> > > >>
> > > >> Yes, it is hard to maintain ASID between IOMMU and CPUMMU or
> > > >> different system, because it's difficult to synchronize the
> > > >> IO_ASID when the CPU ASID is rollover.
> > > >> But we could still use hardware broadcast TLB invalidation
> > > >> instruction to uniformly manage the ASID and IO_ASID, or
> > > >> OTHER_ASID in
> > > our IOMMU.
> > > >
> > > > That's probably a bad idea, because you'll likely stall execution
> > > > on the CPU until the IOTLB has completed invalidation. In the case
> > > > of ATS, I think an endpoint ATC is permitted to take over a minute
> > > > to respond. In reality, I suspect the worst you'll ever see would
> > > > be in the msec range, but that's still an unacceptable period of
> > > > time to hold a
> > > CPU.
> > > >
> > > >> Welcome to join our disscusion:
> > > >> "Introduce an implementation of IOMMU in linux-riscv"
> > > >> 9 Sep 2019, 10:45 Jade-room-I&II (Corinthia Hotel Lisbon) RISC-V
> > > >> MC
> > > >
> > > > I attended this session, but it unfortunately raised many more
> > > > questions than it answered.
> > >
> > > Ya, we're a long way from figuring this out.
> >
> > For everyone's reference, here is our first attempt at RISC-V ASID allocator:
> > http://archive.lwn.net:8080/linux-kernel/20190329045111.14040-1-anup.p
> > atel@wdc.com/T/#u
> 
> With a reply stating that the patch "absolutely does not work" ;)

This patch was tested on existing HW (which does not have ASID implementation)
and tested on QEMU (which has very simplistic Implementation of ASID).

When I asked Gary Guo about way to get access to their HW (in same patch
email thread), I did not get any reply. After so many months passed, I now
doubt the his comment "absolutely does not work".

> 
> What exactly do you want people to do with that? It's an awful lot of effort to
> review this sort of stuff and given that Guo Ren is talking about sharing page
> tables between the CPU and an accelerator, maybe you're better off
> stabilising Linux for the platforms that you can actually test rather than
> getting so far ahead of yourselves that you end up with a bunch of wasted
> work on patches that probably won't get merged any time soon.

The intention of the ASID patch was to encourage RISC-V implementations
having ASID in HW and also ensure that things don't break on existing HW.

I don't see our efforts being wasted in trying to make Linux RISC-V feature
complete and encouraging more feature rich RISC-V CPUs.

Delays in merging patches are fine as long as people have something to try
on their RISC-V CPU implementations.

> 
> Seriously, they say "walk before you can run", but this is more "crawl before
> you can fly". What's the rush?
> 
> Will

Regards,
Anup
Guo Ren Sept. 19, 2019, 1:07 p.m. UTC | #34
Hi,

On Mon, Sep 16, 2019 at 8:57 PM Jean-Philippe Brucker
<jean-philippe@linaro.org> wrote:
> On 13/09/2019 09:13, Guo Ren wrote:
> > Another idea is seperate remote TLB invalidate into two instructions:
> >
> >  - sfence.vma.b.asyc
> >  - sfence.vma.b.barrier // wait all async TLB invalidate operations
> > finished for all harts.
>
> It's not clear to me how this helps, but I probably don't have the whole
> picture. If you have a place where it is safe to wait for the barrier to
> complete, why not do the whole invalidate there?
>
> > (I remember who mentioned me separate them into two instructions after
> > session. Anup? Is the idea right ?)
Forget it, I still use irq signal in my formal proposal [1]. I also
couldn't image the whole picture :P


> >     To solve the problem, we could define a async mode in sfence.vma.b to
> >     slove the problem and finished with per_cpu_irq/exception.
>
> The solution I had to this problem is pinning the ASID [1] used by the
> IOMMU, to prevent the CPU from recycling the ASID on rollover. This way
> the CPU doesn't have to wait for IOMMU invalidations to complete, when
> scheduling a task that might not even have anything to do with the IOMMU.
>

> In the Arm SMMU, ASID and IOASID (PASID) are separate identifiers. IOASID
> indexes an entry in the context descriptor table, which contains the ASID.
> So with unpinned shared ASID you don't need to invalidate the ATC on
> rollover, since the IOASID doesn't change, but you do need to modify the
> context descriptor and invalidate cached versions of it.
The terminology confused me a lot. I perfer use PASID for IOMMU and
ASID is for CPU.
Arm's entry of the context descriptor table contains a "IOASID"

IOASID != ASID for CPU_TLB and IOMMU_TLB.

When you say "since the IOASID doesn't change",Is it PASID or my IOASID ? -_*!
PASID in PCI-sig was used to determine transfer address space.
For intel, the entry which is indexed by PASID also contain S1/S2.PGD
and DID(VMID).
For arm, the entry which is indexed by PASID only contain S1.PGD and
IOASID. Compare to Intel Vt-d Scalable mode, arm's design can't
support PCI Virtual Function.

>
> Once you have pinned ASIDs, you could also declare that IOASID = ASID. I
> don't remember finding an argument to strictly forbid it, even though ASID
> and IOASID have different sizes on Arm (respectively 8/16 and 20 bits).
ASID and IOASID are hard to keep the same between CPU system and IOMMU
system. So I introduce S1/S2.PGD.PPN as a bridge between CPUs and
IOMMUs.
See my proposal [1]

1: https://lore.kernel.org/linux-csky/1568896556-28769-1-git-send-email-guoren@kernel.org/T/#u
Guo Ren Sept. 19, 2019, 1:36 p.m. UTC | #35
Hi,

On Tue, Sep 17, 2019 at 11:42 AM Anup Patel <Anup.Patel@wdc.com> wrote:

> >
> > With a reply stating that the patch "absolutely does not work" ;)
>
> This patch was tested on existing HW (which does not have ASID implementation)
> and tested on QEMU (which has very simplistic Implementation of ASID).
>
> When I asked Gary Guo about way to get access to their HW (in same patch
> email thread), I did not get any reply. After so many months passed, I now
> doubt the his comment "absolutely does not work".
> >
> > What exactly do you want people to do with that? It's an awful lot of effort to
> > review this sort of stuff and given that Guo Ren is talking about sharing page
> > tables between the CPU and an accelerator, maybe you're better off
> > stabilising Linux for the platforms that you can actually test rather than
> > getting so far ahead of yourselves that you end up with a bunch of wasted
> > work on patches that probably won't get merged any time soon.
>
> The intention of the ASID patch was to encourage RISC-V implementations
> having ASID in HW and also ensure that things don't break on existing HW.
>
> I don't see our efforts being wasted in trying to make Linux RISC-V feature
> complete and encouraging more feature rich RISC-V CPUs.
>
> Delays in merging patches are fine as long as people have something to try
> on their RISC-V CPU implementations.
>
I'm the supporter of that patch:
http://archive.lwn.net:8080/linux-kernel/20190329045111.14040-1-anup.patel@wdc.com/T/#u

Because it implicit hw broadcast tlb invalidation optimization.

Honestly it's not suitable for remote tlb flush with software IPI, but
it's still much better than current RISC-V's.

I'll try it on our hardware: 910. wait a moment :)
Jean-Philippe Brucker Sept. 19, 2019, 3:18 p.m. UTC | #36
On Thu, Sep 19, 2019 at 09:07:15PM +0800, Guo Ren wrote:
> > The solution I had to this problem is pinning the ASID [1] used by the
> > IOMMU, to prevent the CPU from recycling the ASID on rollover. This way
> > the CPU doesn't have to wait for IOMMU invalidations to complete, when
> > scheduling a task that might not even have anything to do with the IOMMU.
> >
> 
> > In the Arm SMMU, ASID and IOASID (PASID) are separate identifiers. IOASID
> > indexes an entry in the context descriptor table, which contains the ASID.
> > So with unpinned shared ASID you don't need to invalidate the ATC on
> > rollover, since the IOASID doesn't change, but you do need to modify the
> > context descriptor and invalidate cached versions of it.
> The terminology confused me a lot. I perfer use PASID for IOMMU and
> ASID is for CPU.
> Arm's entry of the context descriptor table contains a "IOASID"

The terminology I've been using so far is different:
* IOASID is PASID
* The entry in the context descriptor table contains an ASID, which
  is either "shared" with CPUs or "private" to the SMMU (the SMMU spec
  says "shared" or "non-shared").
* So the CPU and SMMU TLBs use ASIDs, and the PCI ATC uses IOASID

> IOASID != ASID for CPU_TLB and IOMMU_TLB.
> 
> When you say "since the IOASID doesn't change",Is it PASID or my IOASID ? -_*!

I was talking about PASID. Maybe we can drop "IOASID" and talk only
about ASID and PASID :)

> PASID in PCI-sig was used to determine transfer address space.
> For intel, the entry which is indexed by PASID also contain S1/S2.PGD
> and DID(VMID).
> For arm, the entry which is indexed by PASID only contain S1.PGD and
> IOASID. Compare to Intel Vt-d Scalable mode, arm's design can't
> support PCI Virtual Function.

The SMMU does support PCI Virtual Function - an hypervisor can assign a
VF to a guest, and let that guest partition the VF into smaller contexts
by using PASID.  What it can't support is assigning partitions of a PCI
function (VF or PF) to multiple Virtual Machines, since there is a
single S2 PGD per function (in the Stream Table Entry), rather than one
S2 PGD per PASID context.

Thanks,
Jean

> > Once you have pinned ASIDs, you could also declare that IOASID = ASID. I
> > don't remember finding an argument to strictly forbid it, even though ASID
> > and IOASID have different sizes on Arm (respectively 8/16 and 20 bits).
> ASID and IOASID are hard to keep the same between CPU system and IOMMU
> system. So I introduce S1/S2.PGD.PPN as a bridge between CPUs and
> IOMMUs.
> See my proposal [1]
> 
> 1: https://lore.kernel.org/linux-csky/1568896556-28769-1-git-send-email-guoren@kernel.org/T/#u
> -- 
> Best Regards
>  Guo Ren
> 
> ML: https://lore.kernel.org/linux-csky/
Guo Ren Sept. 20, 2019, 12:07 a.m. UTC | #37
On Thu, Sep 19, 2019 at 11:18 PM Jean-Philippe Brucker
<jean-philippe@linaro.org> wrote:

>
> The SMMU does support PCI Virtual Function - an hypervisor can assign a
> VF to a guest, and let that guest partition the VF into smaller contexts
> by using PASID.  What it can't support is assigning partitions of a PCI
> function (VF or PF) to multiple Virtual Machines, since there is a
> single S2 PGD per function (in the Stream Table Entry), rather than one
> S2 PGD per PASID context.
>
In my concept, the two sentences "The SMMU does support PCI Virtual
Functio" v.s. "What it can't support is assigning partitions of a PCI
function (VF or PF) to multiple Virtual Machines" are conflict and I
don't want to play naming game :)
Jean-Philippe Brucker Sept. 20, 2019, 7:18 a.m. UTC | #38
On Fri, Sep 20, 2019 at 08:07:38AM +0800, Guo Ren wrote:
> On Thu, Sep 19, 2019 at 11:18 PM Jean-Philippe Brucker
> <jean-philippe@linaro.org> wrote:
> 
> >
> > The SMMU does support PCI Virtual Function - an hypervisor can assign a
> > VF to a guest, and let that guest partition the VF into smaller contexts
> > by using PASID.  What it can't support is assigning partitions of a PCI
> > function (VF or PF) to multiple Virtual Machines, since there is a
> > single S2 PGD per function (in the Stream Table Entry), rather than one
> > S2 PGD per PASID context.
> >
> In my concept, the two sentences "The SMMU does support PCI Virtual
> Functio" v.s. "What it can't support is assigning partitions of a PCI
> function (VF or PF) to multiple Virtual Machines" are conflict and I
> don't want to play naming game :)

That's fine. But to prevent the spread of misinformation: Arm SMMU
supports PCI Virtual Functions.

Thanks,
Jean
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/asid.h b/arch/arm64/include/asm/asid.h
new file mode 100644
index 000000000000..bb62b587f37f
--- /dev/null
+++ b/arch/arm64/include/asm/asid.h
@@ -0,0 +1,77 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_ASM_ASID_H
+#define __ASM_ASM_ASID_H
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+
+struct asid_info
+{
+	atomic64_t	generation;
+	unsigned long	*map;
+	atomic64_t __percpu	*active;
+	u64 __percpu		*reserved;
+	u32			bits;
+	/* Lock protecting the structure */
+	raw_spinlock_t		lock;
+	/* Which CPU requires context flush on next call */
+	cpumask_t		flush_pending;
+	/* Number of ASID allocated by context (shift value) */
+	unsigned int		ctxt_shift;
+	/* Callback to locally flush the context. */
+	void			(*flush_cpu_ctxt_cb)(void);
+};
+
+#define NUM_ASIDS(info)			(1UL << ((info)->bits))
+#define NUM_CTXT_ASIDS(info)		(NUM_ASIDS(info) >> (info)->ctxt_shift)
+
+#define active_asid(info, cpu)	*per_cpu_ptr((info)->active, cpu)
+
+void asid_new_context(struct asid_info *info, atomic64_t *pasid,
+		      unsigned int cpu);
+
+/*
+ * Check the ASID is still valid for the context. If not generate a new ASID.
+ *
+ * @pasid: Pointer to the current ASID batch
+ * @cpu: current CPU ID. Must have been acquired throught get_cpu()
+ */
+static inline void asid_check_context(struct asid_info *info,
+				      atomic64_t *pasid, unsigned int cpu)
+{
+	u64 asid, old_active_asid;
+
+	asid = atomic64_read(pasid);
+
+	/*
+	 * The memory ordering here is subtle.
+	 * If our active_asid is non-zero and the ASID matches the current
+	 * generation, then we update the active_asid entry with a relaxed
+	 * cmpxchg. Racing with a concurrent rollover means that either:
+	 *
+	 * - We get a zero back from the cmpxchg and end up waiting on the
+	 *   lock. Taking the lock synchronises with the rollover and so
+	 *   we are forced to see the updated generation.
+	 *
+	 * - We get a valid ASID back from the cmpxchg, which means the
+	 *   relaxed xchg in flush_context will treat us as reserved
+	 *   because atomic RmWs are totally ordered for a given location.
+	 */
+	old_active_asid = atomic64_read(&active_asid(info, cpu));
+	if (old_active_asid &&
+	    !((asid ^ atomic64_read(&info->generation)) >> info->bits) &&
+	    atomic64_cmpxchg_relaxed(&active_asid(info, cpu),
+				     old_active_asid, asid))
+		return;
+
+	asid_new_context(info, pasid, cpu);
+}
+
+int asid_allocator_init(struct asid_info *info,
+			u32 bits, unsigned int asid_per_ctxt,
+			void (*flush_cpu_ctxt_cb)(void));
+
+#endif
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 5540a1638baf..720df5ee2aa2 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -5,6 +5,8 @@  lib-y		:= clear_user.o delay.o copy_from_user.o		\
 		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
 		   strchr.o strrchr.o tishift.o
 
+lib-y		+= asid.o
+
 ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
 obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
 CFLAGS_REMOVE_xor-neon.o	+= -mgeneral-regs-only
diff --git a/arch/arm64/lib/asid.c b/arch/arm64/lib/asid.c
new file mode 100644
index 000000000000..72b71bfb32be
--- /dev/null
+++ b/arch/arm64/lib/asid.c
@@ -0,0 +1,185 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic ASID allocator.
+ *
+ * Based on arch/arm/mm/context.c
+ *
+ * Copyright (C) 2002-2003 Deep Blue Solutions Ltd, all rights reserved.
+ * Copyright (C) 2012 ARM Ltd.
+ */
+
+#include <linux/slab.h>
+
+#include <asm/asid.h>
+
+#define reserved_asid(info, cpu) *per_cpu_ptr((info)->reserved, cpu)
+
+#define ASID_MASK(info)			(~GENMASK((info)->bits - 1, 0))
+#define ASID_FIRST_VERSION(info)	(1UL << ((info)->bits))
+
+#define asid2idx(info, asid)		(((asid) & ~ASID_MASK(info)) >> (info)->ctxt_shift)
+#define idx2asid(info, idx)		(((idx) << (info)->ctxt_shift) & ~ASID_MASK(info))
+
+static void flush_context(struct asid_info *info)
+{
+	int i;
+	u64 asid;
+
+	/* Update the list of reserved ASIDs and the ASID bitmap. */
+	bitmap_clear(info->map, 0, NUM_CTXT_ASIDS(info));
+
+	for_each_possible_cpu(i) {
+		asid = atomic64_xchg_relaxed(&active_asid(info, i), 0);
+		/*
+		 * If this CPU has already been through a
+		 * rollover, but hasn't run another task in
+		 * the meantime, we must preserve its reserved
+		 * ASID, as this is the only trace we have of
+		 * the process it is still running.
+		 */
+		if (asid == 0)
+			asid = reserved_asid(info, i);
+		__set_bit(asid2idx(info, asid), info->map);
+		reserved_asid(info, i) = asid;
+	}
+
+	/*
+	 * Queue a TLB invalidation for each CPU to perform on next
+	 * context-switch
+	 */
+	cpumask_setall(&info->flush_pending);
+}
+
+static bool check_update_reserved_asid(struct asid_info *info, u64 asid,
+				       u64 newasid)
+{
+	int cpu;
+	bool hit = false;
+
+	/*
+	 * Iterate over the set of reserved ASIDs looking for a match.
+	 * If we find one, then we can update our mm to use newasid
+	 * (i.e. the same ASID in the current generation) but we can't
+	 * exit the loop early, since we need to ensure that all copies
+	 * of the old ASID are updated to reflect the mm. Failure to do
+	 * so could result in us missing the reserved ASID in a future
+	 * generation.
+	 */
+	for_each_possible_cpu(cpu) {
+		if (reserved_asid(info, cpu) == asid) {
+			hit = true;
+			reserved_asid(info, cpu) = newasid;
+		}
+	}
+
+	return hit;
+}
+
+static u64 new_context(struct asid_info *info, atomic64_t *pasid)
+{
+	static u32 cur_idx = 1;
+	u64 asid = atomic64_read(pasid);
+	u64 generation = atomic64_read(&info->generation);
+
+	if (asid != 0) {
+		u64 newasid = generation | (asid & ~ASID_MASK(info));
+
+		/*
+		 * If our current ASID was active during a rollover, we
+		 * can continue to use it and this was just a false alarm.
+		 */
+		if (check_update_reserved_asid(info, asid, newasid))
+			return newasid;
+
+		/*
+		 * We had a valid ASID in a previous life, so try to re-use
+		 * it if possible.
+		 */
+		if (!__test_and_set_bit(asid2idx(info, asid), info->map))
+			return newasid;
+	}
+
+	/*
+	 * Allocate a free ASID. If we can't find one, take a note of the
+	 * currently active ASIDs and mark the TLBs as requiring flushes.  We
+	 * always count from ASID #2 (index 1), as we use ASID #0 when setting
+	 * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd
+	 * pairs.
+	 */
+	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), cur_idx);
+	if (asid != NUM_CTXT_ASIDS(info))
+		goto set_asid;
+
+	/* We're out of ASIDs, so increment the global generation count */
+	generation = atomic64_add_return_relaxed(ASID_FIRST_VERSION(info),
+						 &info->generation);
+	flush_context(info);
+
+	/* We have more ASIDs than CPUs, so this will always succeed */
+	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), 1);
+
+set_asid:
+	__set_bit(asid, info->map);
+	cur_idx = asid;
+	return idx2asid(info, asid) | generation;
+}
+
+/*
+ * Generate a new ASID for the context.
+ *
+ * @pasid: Pointer to the current ASID batch allocated. It will be updated
+ * with the new ASID batch.
+ * @cpu: current CPU ID. Must have been acquired through get_cpu()
+ */
+void asid_new_context(struct asid_info *info, atomic64_t *pasid,
+		      unsigned int cpu)
+{
+	unsigned long flags;
+	u64 asid;
+
+	raw_spin_lock_irqsave(&info->lock, flags);
+	/* Check that our ASID belongs to the current generation. */
+	asid = atomic64_read(pasid);
+	if ((asid ^ atomic64_read(&info->generation)) >> info->bits) {
+		asid = new_context(info, pasid);
+		atomic64_set(pasid, asid);
+	}
+
+	if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending))
+		info->flush_cpu_ctxt_cb();
+
+	atomic64_set(&active_asid(info, cpu), asid);
+	raw_spin_unlock_irqrestore(&info->lock, flags);
+}
+
+/*
+ * Initialize the ASID allocator
+ *
+ * @info: Pointer to the asid allocator structure
+ * @bits: Number of ASIDs available
+ * @asid_per_ctxt: Number of ASIDs to allocate per-context. ASIDs are
+ * allocated contiguously for a given context. This value should be a power of
+ * 2.
+ */
+int asid_allocator_init(struct asid_info *info,
+			u32 bits, unsigned int asid_per_ctxt,
+			void (*flush_cpu_ctxt_cb)(void))
+{
+	info->bits = bits;
+	info->ctxt_shift = ilog2(asid_per_ctxt);
+	info->flush_cpu_ctxt_cb = flush_cpu_ctxt_cb;
+	/*
+	 * Expect allocation after rollover to fail if we don't have at least
+	 * one more ASID than CPUs. ASID #0 is always reserved.
+	 */
+	WARN_ON(NUM_CTXT_ASIDS(info) - 1 <= num_possible_cpus());
+	atomic64_set(&info->generation, ASID_FIRST_VERSION(info));
+	info->map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
+			    sizeof(*info->map), GFP_KERNEL);
+	if (!info->map)
+		return -ENOMEM;
+
+	raw_spin_lock_init(&info->lock);
+
+	return 0;
+}
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 678a57b77c91..95ee7711a2ef 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -22,47 +22,22 @@ 
 #include <linux/slab.h>
 #include <linux/mm.h>
 
+#include <asm/asid.h>
 #include <asm/cpufeature.h>
 #include <asm/mmu_context.h>
 #include <asm/smp.h>
 #include <asm/tlbflush.h>
 
-struct asid_info
-{
-	atomic64_t	generation;
-	unsigned long	*map;
-	atomic64_t __percpu	*active;
-	u64 __percpu		*reserved;
-	u32			bits;
-	raw_spinlock_t		lock;
-	/* Which CPU requires context flush on next call */
-	cpumask_t		flush_pending;
-	/* Number of ASID allocated by context (shift value) */
-	unsigned int		ctxt_shift;
-	/* Callback to locally flush the context. */
-	void			(*flush_cpu_ctxt_cb)(void);
-} asid_info;
-
-#define active_asid(info, cpu)	*per_cpu_ptr((info)->active, cpu)
-#define reserved_asid(info, cpu) *per_cpu_ptr((info)->reserved, cpu)
-
 static DEFINE_PER_CPU(atomic64_t, active_asids);
 static DEFINE_PER_CPU(u64, reserved_asids);
 
-#define ASID_MASK(info)			(~GENMASK((info)->bits - 1, 0))
-#define NUM_ASIDS(info)			(1UL << ((info)->bits))
-
-#define ASID_FIRST_VERSION(info)	NUM_ASIDS(info)
-
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 #define ASID_PER_CONTEXT		2
 #else
 #define ASID_PER_CONTEXT		1
 #endif
 
-#define NUM_CTXT_ASIDS(info)		(NUM_ASIDS(info) >> (info)->ctxt_shift)
-#define asid2idx(info, asid)		(((asid) & ~ASID_MASK(info)) >> (info)->ctxt_shift)
-#define idx2asid(info, idx)		(((idx) << (info)->ctxt_shift) & ~ASID_MASK(info))
+struct asid_info asid_info;
 
 /* Get the ASIDBits supported by the current CPU */
 static u32 get_cpu_asid_bits(void)
@@ -102,178 +77,6 @@  void verify_cpu_asid_bits(void)
 	}
 }
 
-static void flush_context(struct asid_info *info)
-{
-	int i;
-	u64 asid;
-
-	/* Update the list of reserved ASIDs and the ASID bitmap. */
-	bitmap_clear(info->map, 0, NUM_CTXT_ASIDS(info));
-
-	for_each_possible_cpu(i) {
-		asid = atomic64_xchg_relaxed(&active_asid(info, i), 0);
-		/*
-		 * If this CPU has already been through a
-		 * rollover, but hasn't run another task in
-		 * the meantime, we must preserve its reserved
-		 * ASID, as this is the only trace we have of
-		 * the process it is still running.
-		 */
-		if (asid == 0)
-			asid = reserved_asid(info, i);
-		__set_bit(asid2idx(info, asid), info->map);
-		reserved_asid(info, i) = asid;
-	}
-
-	/*
-	 * Queue a TLB invalidation for each CPU to perform on next
-	 * context-switch
-	 */
-	cpumask_setall(&info->flush_pending);
-}
-
-static bool check_update_reserved_asid(struct asid_info *info, u64 asid,
-				       u64 newasid)
-{
-	int cpu;
-	bool hit = false;
-
-	/*
-	 * Iterate over the set of reserved ASIDs looking for a match.
-	 * If we find one, then we can update our mm to use newasid
-	 * (i.e. the same ASID in the current generation) but we can't
-	 * exit the loop early, since we need to ensure that all copies
-	 * of the old ASID are updated to reflect the mm. Failure to do
-	 * so could result in us missing the reserved ASID in a future
-	 * generation.
-	 */
-	for_each_possible_cpu(cpu) {
-		if (reserved_asid(info, cpu) == asid) {
-			hit = true;
-			reserved_asid(info, cpu) = newasid;
-		}
-	}
-
-	return hit;
-}
-
-static u64 new_context(struct asid_info *info, atomic64_t *pasid)
-{
-	static u32 cur_idx = 1;
-	u64 asid = atomic64_read(pasid);
-	u64 generation = atomic64_read(&info->generation);
-
-	if (asid != 0) {
-		u64 newasid = generation | (asid & ~ASID_MASK(info));
-
-		/*
-		 * If our current ASID was active during a rollover, we
-		 * can continue to use it and this was just a false alarm.
-		 */
-		if (check_update_reserved_asid(info, asid, newasid))
-			return newasid;
-
-		/*
-		 * We had a valid ASID in a previous life, so try to re-use
-		 * it if possible.
-		 */
-		if (!__test_and_set_bit(asid2idx(info, asid), info->map))
-			return newasid;
-	}
-
-	/*
-	 * Allocate a free ASID. If we can't find one, take a note of the
-	 * currently active ASIDs and mark the TLBs as requiring flushes.  We
-	 * always count from ASID #2 (index 1), as we use ASID #0 when setting
-	 * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd
-	 * pairs.
-	 */
-	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), cur_idx);
-	if (asid != NUM_CTXT_ASIDS(info))
-		goto set_asid;
-
-	/* We're out of ASIDs, so increment the global generation count */
-	generation = atomic64_add_return_relaxed(ASID_FIRST_VERSION(info),
-						 &info->generation);
-	flush_context(info);
-
-	/* We have more ASIDs than CPUs, so this will always succeed */
-	asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), 1);
-
-set_asid:
-	__set_bit(asid, info->map);
-	cur_idx = asid;
-	return idx2asid(info, asid) | generation;
-}
-
-static void asid_new_context(struct asid_info *info, atomic64_t *pasid,
-			     unsigned int cpu);
-
-/*
- * Check the ASID is still valid for the context. If not generate a new ASID.
- *
- * @pasid: Pointer to the current ASID batch
- * @cpu: current CPU ID. Must have been acquired throught get_cpu()
- */
-static void asid_check_context(struct asid_info *info,
-			       atomic64_t *pasid, unsigned int cpu)
-{
-	u64 asid, old_active_asid;
-
-	asid = atomic64_read(pasid);
-
-	/*
-	 * The memory ordering here is subtle.
-	 * If our active_asid is non-zero and the ASID matches the current
-	 * generation, then we update the active_asid entry with a relaxed
-	 * cmpxchg. Racing with a concurrent rollover means that either:
-	 *
-	 * - We get a zero back from the cmpxchg and end up waiting on the
-	 *   lock. Taking the lock synchronises with the rollover and so
-	 *   we are forced to see the updated generation.
-	 *
-	 * - We get a valid ASID back from the cmpxchg, which means the
-	 *   relaxed xchg in flush_context will treat us as reserved
-	 *   because atomic RmWs are totally ordered for a given location.
-	 */
-	old_active_asid = atomic64_read(&active_asid(info, cpu));
-	if (old_active_asid &&
-	    !((asid ^ atomic64_read(&info->generation)) >> info->bits) &&
-	    atomic64_cmpxchg_relaxed(&active_asid(info, cpu),
-				     old_active_asid, asid))
-		return;
-
-	asid_new_context(info, pasid, cpu);
-}
-
-/*
- * Generate a new ASID for the context.
- *
- * @pasid: Pointer to the current ASID batch allocated. It will be updated
- * with the new ASID batch.
- * @cpu: current CPU ID. Must have been acquired through get_cpu()
- */
-static void asid_new_context(struct asid_info *info, atomic64_t *pasid,
-			     unsigned int cpu)
-{
-	unsigned long flags;
-	u64 asid;
-
-	raw_spin_lock_irqsave(&info->lock, flags);
-	/* Check that our ASID belongs to the current generation. */
-	asid = atomic64_read(pasid);
-	if ((asid ^ atomic64_read(&info->generation)) >> info->bits) {
-		asid = new_context(info, pasid);
-		atomic64_set(pasid, asid);
-	}
-
-	if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending))
-		info->flush_cpu_ctxt_cb();
-
-	atomic64_set(&active_asid(info, cpu), asid);
-	raw_spin_unlock_irqrestore(&info->lock, flags);
-}
-
 void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 {
 	if (system_supports_cnp())
@@ -305,38 +108,6 @@  static void asid_flush_cpu_ctxt(void)
 	local_flush_tlb_all();
 }
 
-/*
- * Initialize the ASID allocator
- *
- * @info: Pointer to the asid allocator structure
- * @bits: Number of ASIDs available
- * @asid_per_ctxt: Number of ASIDs to allocate per-context. ASIDs are
- * allocated contiguously for a given context. This value should be a power of
- * 2.
- */
-static int asid_allocator_init(struct asid_info *info,
-			       u32 bits, unsigned int asid_per_ctxt,
-			       void (*flush_cpu_ctxt_cb)(void))
-{
-	info->bits = bits;
-	info->ctxt_shift = ilog2(asid_per_ctxt);
-	info->flush_cpu_ctxt_cb = flush_cpu_ctxt_cb;
-	/*
-	 * Expect allocation after rollover to fail if we don't have at least
-	 * one more ASID than CPUs. ASID #0 is always reserved.
-	 */
-	WARN_ON(NUM_CTXT_ASIDS(info) - 1 <= num_possible_cpus());
-	atomic64_set(&info->generation, ASID_FIRST_VERSION(info));
-	info->map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
-			    sizeof(*info->map), GFP_KERNEL);
-	if (!info->map)
-		return -ENOMEM;
-
-	raw_spin_lock_init(&info->lock);
-
-	return 0;
-}
-
 static int asids_init(void)
 {
 	u32 bits = get_cpu_asid_bits();
@@ -344,7 +115,7 @@  static int asids_init(void)
 	if (!asid_allocator_init(&asid_info, bits, ASID_PER_CONTEXT,
 				 asid_flush_cpu_ctxt))
 		panic("Unable to initialize ASID allocator for %lu ASIDs\n",
-		      1UL << bits);
+		      NUM_ASIDS(&asid_info));
 
 	asid_info.active = &active_asids;
 	asid_info.reserved = &reserved_asids;