[PATCH-v3,1/4] idr: Percpu ida

Message ID 1376694549-20609-2-git-send-email-nab@linux-iscsi.org (mailing list archive)
State New, archived

Commit Message

Nicholas A. Bellinger Aug. 16, 2013, 11:09 p.m. UTC
From: Kent Overstreet <kmo@daterainc.com>

Percpu frontend for allocating ids. With percpu allocation (that works),
it's impossible to guarantee it will always be possible to allocate all
nr_tags - typically, some will be stuck on a remote percpu freelist
where the current job can't get to them.

We do guarantee that it will always be possible to allocate at least
(nr_tags / 2) tags - this is done by keeping track of which and how many
cpus have tags on their percpu freelists. On allocation failure if
enough cpus have tags that there could potentially be (nr_tags / 2) tags
stuck on remote percpu freelists, we then pick a remote cpu at random to
steal from.

Note that there's no cpu hotplug notifier - we don't care, because
steal_tags() will eventually get the down cpu's tags. We _could_ satisfy
more allocations if we had a notifier - but we'll still meet our
guarantees and it's absolutely not a correctness issue, so I don't think
it's worth the extra code.

Signed-off-by: Kent Overstreet <koverstreet@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 include/linux/idr.h |   53 +++++++++
 lib/idr.c           |  316 +++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 361 insertions(+), 8 deletions(-)
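
For context, a minimal usage sketch of the API this patch adds, based on the
percpu_ida_init()/percpu_ida_alloc()/percpu_ida_free()/percpu_ida_destroy()
signatures below (the driver-side names and tag count are hypothetical):

/*
 * Hypothetical consumer of the percpu_ida tag allocator - tags index a
 * preallocated array of per-request state.
 */
#include <linux/idr.h>
#include <linux/gfp.h>

#define EXAMPLE_NR_TAGS		128

static struct percpu_ida example_tags;

static int example_init(void)
{
	/* Pool hands out integers in the range [0, EXAMPLE_NR_TAGS). */
	return percpu_ida_init(&example_tags, EXAMPLE_NR_TAGS);
}

static int example_queue_request(void)
{
	/*
	 * GFP_KERNEL includes __GFP_WAIT, so this sleeps until a tag is
	 * freed instead of returning -ENOSPC.
	 */
	int tag = percpu_ida_alloc(&example_tags, GFP_KERNEL);

	if (tag < 0)
		return tag;	/* only reachable without __GFP_WAIT */

	/* ... use 'tag' to index the preallocated request array ... */

	percpu_ida_free(&example_tags, tag);
	return 0;
}

static void example_exit(void)
{
	percpu_ida_destroy(&example_tags);
}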

Comments

Andrew Morton Aug. 20, 2013, 9:31 p.m. UTC | #1
On Fri, 16 Aug 2013 23:09:06 +0000 "Nicholas A. Bellinger" <nab@linux-iscsi.org> wrote:

> From: Kent Overstreet <kmo@daterainc.com>
> 
> Percpu frontend for allocating ids. With percpu allocation (that works),
> it's impossible to guarantee it will always be possible to allocate all
> nr_tags - typically, some will be stuck on a remote percpu freelist
> where the current job can't get to them.
> 
> We do guarantee that it will always be possible to allocate at least
> (nr_tags / 2) tags - this is done by keeping track of which and how many
> cpus have tags on their percpu freelists. On allocation failure if
> enough cpus have tags that there could potentially be (nr_tags / 2) tags
> stuck on remote percpu freelists, we then pick a remote cpu at random to
> steal from.
> 
> Note that there's no cpu hotplug notifier - we don't care, because
> steal_tags() will eventually get the down cpu's tags. We _could_ satisfy
> more allocations if we had a notifier - but we'll still meet our
> guarantees and it's absolutely not a correctness issue, so I don't think
> it's worth the extra code.
>
> ...
>
>  include/linux/idr.h |   53 +++++++++
>  lib/idr.c           |  316 +++++++++++++++++++++++++++++++++++++++++++++++++--

I don't think this should be in idr.[ch] at all.  It has no
relationship with the existing code.  Apart from duplicating its
functionality :(

> 
> ...
>
> @@ -243,4 +245,55 @@ static inline int ida_get_new(struct ida *ida, int *p_id)
>  
>  void __init idr_init_cache(void);
>  
> +/* Percpu IDA/tag allocator */
> +
> +struct percpu_ida_cpu;
> +
> +struct percpu_ida {
> +	/*
> +	 * number of tags available to be allocated, as passed to
> +	 * percpu_ida_init()
> +	 */
> +	unsigned			nr_tags;
> +
> +	struct percpu_ida_cpu __percpu	*tag_cpu;
> +
> +	/*
> +	 * Bitmap of cpus that (may) have tags on their percpu freelists:
> +	 * steal_tags() uses this to decide when to steal tags, and which cpus
> +	 * to try stealing from.
> +	 *
> +	 * It's ok for a freelist to be empty when its bit is set - steal_tags()
> +	 * will just keep looking - but the bitmap _must_ be set whenever a
> +	 * percpu freelist does have tags.
> +	 */
> +	unsigned long			*cpus_have_tags;

Why not cpumask_t?

> +	struct {
> +		spinlock_t		lock;
> +		/*
> +		 * When we go to steal tags from another cpu (see steal_tags()),
> +		 * we want to pick a cpu at random. Cycling through them every
> +		 * time we steal is a bit easier and more or less equivalent:
> +		 */
> +		unsigned		cpu_last_stolen;
> +
> +		/* For sleeping on allocation failure */
> +		wait_queue_head_t	wait;
> +
> +		/*
> +		 * Global freelist - it's a stack where nr_free points to the
> +		 * top
> +		 */
> +		unsigned		nr_free;
> +		unsigned		*freelist;
> +	} ____cacheline_aligned_in_smp;

Why the ____cacheline_aligned_in_smp?

> +};
> 
> ...
>
> +
> +/* Percpu IDA */
> +
> +/*
> + * Number of tags we move between the percpu freelist and the global freelist at
> + * a time

"between a percpu freelist" would be more accurate?

> + */
> +#define IDA_PCPU_BATCH_MOVE	32U
> +
> +/* Max size of percpu freelist, */
> +#define IDA_PCPU_SIZE		((IDA_PCPU_BATCH_MOVE * 3) / 2)
> +
> +struct percpu_ida_cpu {
> +	spinlock_t			lock;
> +	unsigned			nr_free;
> +	unsigned			freelist[];
> +};

Data structure needs documentation.  There's one of these per cpu.  I
guess nr_free and freelist are clear enough.  The presence of a lock
in a percpu data structure is a surprise.  It's for cross-cpu stealing,
I assume?

> +static inline void move_tags(unsigned *dst, unsigned *dst_nr,
> +			     unsigned *src, unsigned *src_nr,
> +			     unsigned nr)
> +{
> +	*src_nr -= nr;
> +	memcpy(dst + *dst_nr, src + *src_nr, sizeof(unsigned) * nr);
> +	*dst_nr += nr;
> +}
> +
> 
> ...
>
> +static inline void alloc_global_tags(struct percpu_ida *pool,
> +				     struct percpu_ida_cpu *tags)
> +{
> +	move_tags(tags->freelist, &tags->nr_free,
> +		  pool->freelist, &pool->nr_free,
> +		  min(pool->nr_free, IDA_PCPU_BATCH_MOVE));
> +}

Document this function?

> +static inline unsigned alloc_local_tag(struct percpu_ida *pool,
> +				       struct percpu_ida_cpu *tags)
> +{
> +	int tag = -ENOSPC;
> +
> +	spin_lock(&tags->lock);
> +	if (tags->nr_free)
> +		tag = tags->freelist[--tags->nr_free];
> +	spin_unlock(&tags->lock);
> +
> +	return tag;
> +}

I guess this one's clear enough, if the data structure relationships are
understood.

> +/**
> + * percpu_ida_alloc - allocate a tag
> + * @pool: pool to allocate from
> + * @gfp: gfp flags
> + *
> + * Returns a tag - an integer in the range [0..nr_tags) (passed to
> + * tag_pool_init()), or otherwise -ENOSPC on allocation failure.
> + *
> + * Safe to be called from interrupt context (assuming it isn't passed
> + * __GFP_WAIT, of course).
> + *
> + * Will not fail if passed __GFP_WAIT.
> + */
> +int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp)
> +{
> +	DEFINE_WAIT(wait);
> +	struct percpu_ida_cpu *tags;
> +	unsigned long flags;
> +	int tag;
> +
> +	local_irq_save(flags);
> +	tags = this_cpu_ptr(pool->tag_cpu);
> +
> +	/* Fastpath */
> +	tag = alloc_local_tag(pool, tags);
> +	if (likely(tag >= 0)) {
> +		local_irq_restore(flags);
> +		return tag;
> +	}
> +
> +	while (1) {
> +		spin_lock(&pool->lock);
> +
> +		/*
> +		 * prepare_to_wait() must come before steal_tags(), in case
> +		 * percpu_ida_free() on another cpu flips a bit in
> +		 * cpus_have_tags
> +		 *
> +		 * global lock held and irqs disabled, don't need percpu lock
> +		 */
> +		prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
> +
> +		if (!tags->nr_free)
> +			alloc_global_tags(pool, tags);
> +		if (!tags->nr_free)
> +			steal_tags(pool, tags);
> +
> +		if (tags->nr_free) {
> +			tag = tags->freelist[--tags->nr_free];
> +			if (tags->nr_free)
> +				set_bit(smp_processor_id(),
> +					pool->cpus_have_tags);
> +		}
> +
> +		spin_unlock(&pool->lock);
> +		local_irq_restore(flags);
> +
> +		if (tag >= 0 || !(gfp & __GFP_WAIT))
> +			break;
> +
> +		schedule();
> +
> +		local_irq_save(flags);
> +		tags = this_cpu_ptr(pool->tag_cpu);
> +	}

What guarantees that this wait will terminate?

> +	finish_wait(&pool->wait, &wait);
> +	return tag;
> +}
> +EXPORT_SYMBOL_GPL(percpu_ida_alloc);
> +
> +/**
> + * percpu_ida_free - free a tag
> + * @pool: pool @tag was allocated from
> + * @tag: a tag previously allocated with percpu_ida_alloc()
> + *
> + * Safe to be called from interrupt context.
> + */
> +void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
> +{
> +	struct percpu_ida_cpu *tags;
> +	unsigned long flags;
> +	unsigned nr_free;
> +
> +	BUG_ON(tag >= pool->nr_tags);
> +
> +	local_irq_save(flags);
> +	tags = this_cpu_ptr(pool->tag_cpu);
> +
> +	spin_lock(&tags->lock);

Why do we need this lock, btw?  It's a cpu-local structure and local
irqs are disabled...

> +	tags->freelist[tags->nr_free++] = tag;
> +
> +	nr_free = tags->nr_free;
> +	spin_unlock(&tags->lock);
> +
> +	if (nr_free == 1) {
> +		set_bit(smp_processor_id(),
> +			pool->cpus_have_tags);
> +		wake_up(&pool->wait);
> +	}
> +
> +	if (nr_free == IDA_PCPU_SIZE) {
> +		spin_lock(&pool->lock);
> +
> +		/*
> +		 * Global lock held and irqs disabled, don't need percpu
> +		 * lock
> +		 */
> +		if (tags->nr_free == IDA_PCPU_SIZE) {
> +			move_tags(pool->freelist, &pool->nr_free,
> +				  tags->freelist, &tags->nr_free,
> +				  IDA_PCPU_BATCH_MOVE);
> +
> +			wake_up(&pool->wait);
> +		}
> +		spin_unlock(&pool->lock);
> +	}
> +
> +	local_irq_restore(flags);
> +}
> +EXPORT_SYMBOL_GPL(percpu_ida_free);
> 
> ...
>
> +int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
> +{
> +	unsigned i, cpu, order;
> +
> +	memset(pool, 0, sizeof(*pool));
> +
> +	init_waitqueue_head(&pool->wait);
> +	spin_lock_init(&pool->lock);
> +	pool->nr_tags = nr_tags;
> +
> +	/* Guard against overflow */
> +	if (nr_tags > (unsigned) INT_MAX + 1) {
> +		pr_err("tags.c: nr_tags too large\n");

"tags.c"?

> +		return -EINVAL;
> +	}
> +
> +	order = get_order(nr_tags * sizeof(unsigned));
> +	pool->freelist = (void *) __get_free_pages(GFP_KERNEL, order);
> +	if (!pool->freelist)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < nr_tags; i++)
> +		pool->freelist[i] = i;
> +
> +	pool->nr_free = nr_tags;
> +
> +	pool->cpus_have_tags = kzalloc(BITS_TO_LONGS(nr_cpu_ids) *
> +				       sizeof(unsigned long), GFP_KERNEL);
> +	if (!pool->cpus_have_tags)
> +		goto err;
> +
> +	pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +
> +				       IDA_PCPU_SIZE * sizeof(unsigned),
> +				       sizeof(unsigned));
> +	if (!pool->tag_cpu)
> +		goto err;
> +
> +	for_each_possible_cpu(cpu)
> +		spin_lock_init(&per_cpu_ptr(pool->tag_cpu, cpu)->lock);
> +
> +	return 0;
> +err:
> +	percpu_ida_destroy(pool);
> +	return -ENOMEM;
> +}
> +EXPORT_SYMBOL_GPL(percpu_ida_init);
Christoph Lameter (Ampere) Aug. 21, 2013, 6:25 p.m. UTC | #2
On Fri, 16 Aug 2013, Nicholas A. Bellinger wrote:

> +	spinlock_t			lock;

Remove the spinlock.

> +	unsigned			nr_free;
> +	unsigned			freelist[];
> +};
> +
> +static inline void move_tags(unsigned *dst, unsigned *dst_nr,
> +			     unsigned *src, unsigned *src_nr,
> +			     unsigned nr)
> +{
> +	*src_nr -= nr;
> +	memcpy(dst + *dst_nr, src + *src_nr, sizeof(unsigned) * nr);
> +	*dst_nr += nr;
> +}
> +

> +static inline unsigned alloc_local_tag(struct percpu_ida *pool,
> +				       struct percpu_ida_cpu *tags)

Pass the __percpu offset and not the tags pointer.

> +{
> +	int tag = -ENOSPC;
> +
> +	spin_lock(&tags->lock);

Interrupts are already disabled. Drop the spinlock.

> +	if (tags->nr_free)
> +		tag = tags->freelist[--tags->nr_free];

You can keep this or avoid address calculation through segment prefixes.
F.e.

if (__this_cpu_read(tags->nr_free)) {
	int n = __this_cpu_dec_return(tags->nr_free);
	tag = __this_cpu_read(tags->freelist[n]);
}

> +	spin_unlock(&tags->lock);

Drop.

> + * Returns a tag - an integer in the range [0..nr_tags) (passed to
> + * tag_pool_init()), or otherwise -ENOSPC on allocation failure.
> + *
> + * Safe to be called from interrupt context (assuming it isn't passed
> + * __GFP_WAIT, of course).
> + *
> + * Will not fail if passed __GFP_WAIT.
> + */
> +int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp)
> +{
> +	DEFINE_WAIT(wait);
> +	struct percpu_ida_cpu *tags;
> +	unsigned long flags;
> +	int tag;
> +
> +	local_irq_save(flags);
> +	tags = this_cpu_ptr(pool->tag_cpu);

You could drop this_cpu_ptr if you pass pool->tag_cpu to alloc_local_tag.

> +/**
> + * percpu_ida_free - free a tag
> + * @pool: pool @tag was allocated from
> + * @tag: a tag previously allocated with percpu_ida_alloc()
> + *
> + * Safe to be called from interrupt context.
> + */
> +void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
> +{
> +	struct percpu_ida_cpu *tags;
> +	unsigned long flags;
> +	unsigned nr_free;
> +
> +	BUG_ON(tag >= pool->nr_tags);
> +
> +	local_irq_save(flags);
> +	tags = this_cpu_ptr(pool->tag_cpu);
> +
> +	spin_lock(&tags->lock);

No need for spinlocking
> +	tags->freelist[tags->nr_free++] = tag;

nr_free = __this_cpu_inc_return(pool->tag_cpu->nr_free) ?

__this_cpu_write(pool->tag_cpu->freelist[nr_free - 1], tag)


Kent Overstreet Aug. 26, 2013, 8:14 p.m. UTC | #3
On Tue, Aug 20, 2013 at 02:31:57PM -0700, Andrew Morton wrote:
> On Fri, 16 Aug 2013 23:09:06 +0000 "Nicholas A. Bellinger" <nab@linux-iscsi.org> wrote:
> 
> > From: Kent Overstreet <kmo@daterainc.com>
> > 
> > Percpu frontend for allocating ids. With percpu allocation (that works),
> > it's impossible to guarantee it will always be possible to allocate all
> > nr_tags - typically, some will be stuck on a remote percpu freelist
> > where the current job can't get to them.
> > 
> > We do guarantee that it will always be possible to allocate at least
> > (nr_tags / 2) tags - this is done by keeping track of which and how many
> > cpus have tags on their percpu freelists. On allocation failure if
> > enough cpus have tags that there could potentially be (nr_tags / 2) tags
> > stuck on remote percpu freelists, we then pick a remote cpu at random to
> > steal from.
> > 
> > Note that there's no cpu hotplug notifier - we don't care, because
> > steal_tags() will eventually get the down cpu's tags. We _could_ satisfy
> > more allocations if we had a notifier - but we'll still meet our
> > guarantees and it's absolutely not a correctness issue, so I don't think
> > it's worth the extra code.
> >
> > ...
> >
> >  include/linux/idr.h |   53 +++++++++
> >  lib/idr.c           |  316 +++++++++++++++++++++++++++++++++++++++++++++++++--
> 
> I don't think this should be in idr.[ch] at all.  It has no
> relationship with the existing code.  Apart from duplicating its
> functionality :(

Well, in the full patch series it does make use of the non-percpu ida.
I'm still hoping to get the ida/idr rewrites in.
Kent Overstreet Aug. 26, 2013, 8:23 p.m. UTC | #4
On Wed, Aug 21, 2013 at 06:25:58PM +0000, Christoph Lameter wrote:
> On Fri, 16 Aug 2013, Nicholas A. Bellinger wrote:
> 
> > +	spinlock_t			lock;
> 
> Remove the spinlock.

As Andrew noted, the spinlock is needed because of tag stealing. (You
don't think I'd stick a spinlock on a percpu data structure without a
real reason, would you?)
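
For reference, the cross-cpu path is steal_tags() in the patch; condensed, the
stealing cpu takes the victim cpu's percpu lock before emptying its freelist:

	/* Condensed from steal_tags() in this patch: */
	remote = per_cpu_ptr(pool->tag_cpu, cpu);

	spin_lock(&remote->lock);

	if (remote->nr_free) {
		memcpy(tags->freelist, remote->freelist,
		       sizeof(unsigned) * remote->nr_free);

		tags->nr_free = remote->nr_free;
		remote->nr_free = 0;
	}

	spin_unlock(&remote->lock);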

> > +	unsigned			nr_free;
> > +	unsigned			freelist[];
> > +};
> > +
> > +static inline void move_tags(unsigned *dst, unsigned *dst_nr,
> > +			     unsigned *src, unsigned *src_nr,
> > +			     unsigned nr)
> > +{
> > +	*src_nr -= nr;
> > +	memcpy(dst + *dst_nr, src + *src_nr, sizeof(unsigned) * nr);
> > +	*dst_nr += nr;
> > +}
> > +
> 
> > +static inline unsigned alloc_local_tag(struct percpu_ida *pool,
> > +				       struct percpu_ida_cpu *tags)
> 
> Pass the __percpu offset and not the tags pointer.

Why? It just changes where the this_cpu_ptr() call happens.

> 
> > +{
> > +	int tag = -ENOSPC;
> > +
> > +	spin_lock(&tags->lock);
> 
> Interrupts are already disabled. Drop the spinlock.
> 
> > +	if (tags->nr_free)
> > +		tag = tags->freelist[--tags->nr_free];
> 
> You can keep this or avoid address calculation through segment prefixes.
> F.e.
> 
> if (__this_cpu_read(tags->nr_free)) {
> 	int n = __this_cpu_dec_return(tags->nr_free);
> 	tag = __this_cpu_read(tags->freelist[n]);
> }

Can you explain what the point of that change would be? It sounds like
it's preferable to do it that way and avoid this_cpu_ptr() for some
reason, but you're not explaining why.
Kent Overstreet Aug. 28, 2013, 7:53 p.m. UTC | #5
On Tue, Aug 20, 2013 at 02:31:57PM -0700, Andrew Morton wrote:
> On Fri, 16 Aug 2013 23:09:06 +0000 "Nicholas A. Bellinger" <nab@linux-iscsi.org> wrote:
> > +	/*
> > +	 * Bitmap of cpus that (may) have tags on their percpu freelists:
> > +	 * steal_tags() uses this to decide when to steal tags, and which cpus
> > +	 * to try stealing from.
> > +	 *
> > +	 * It's ok for a freelist to be empty when its bit is set - steal_tags()
> > +	 * will just keep looking - but the bitmap _must_ be set whenever a
> > +	 * percpu freelist does have tags.
> > +	 */
> > +	unsigned long			*cpus_have_tags;
> 
> Why not cpumask_t?

I hadn't encountered it before - looks like it's probably what I want.

I don't see any explanation for the parallel set of operations for
working on cpumasks - e.g. next_cpu()/cpumask_next(). For now I'm going
with the cpumask_* versions - is that what I want?

If you can have a look at the fixup patch that'll be most appreciated.
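
A rough sketch of the cpumask conversion (assuming cpumask_var_t and the
standard cpumask_* helpers; this is just a sketch, not the actual fixup patch):

	/* In struct percpu_ida, replacing 'unsigned long *cpus_have_tags': */
	cpumask_var_t			cpus_have_tags;

	/* percpu_ida_init(): */
	if (!zalloc_cpumask_var(&pool->cpus_have_tags, GFP_KERNEL))
		goto err;

	/* percpu_ida_free(), marking this cpu as having tags: */
	cpumask_set_cpu(smp_processor_id(), pool->cpus_have_tags);

	/* steal_tags(), picking the next candidate cpu with wraparound: */
	cpu = cpumask_next(cpu, pool->cpus_have_tags);
	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(pool->cpus_have_tags);

	/* percpu_ida_destroy(): */
	free_cpumask_var(pool->cpus_have_tags);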

> > +	struct {
> > +		spinlock_t		lock;
> > +		/*
> > +		 * When we go to steal tags from another cpu (see steal_tags()),
> > +		 * we want to pick a cpu at random. Cycling through them every
> > +		 * time we steal is a bit easier and more or less equivalent:
> > +		 */
> > +		unsigned		cpu_last_stolen;
> > +
> > +		/* For sleeping on allocation failure */
> > +		wait_queue_head_t	wait;
> > +
> > +		/*
> > +		 * Global freelist - it's a stack where nr_free points to the
> > +		 * top
> > +		 */
> > +		unsigned		nr_free;
> > +		unsigned		*freelist;
> > +	} ____cacheline_aligned_in_smp;
> 
> Why the ____cacheline_aligned_in_smp?

It's separating the RW stuff that isn't always touched from the RO stuff
that's used on every allocation.

> 
> > +};
> > 
> > ...
> >
> > +
> > +/* Percpu IDA */
> > +
> > +/*
> > + * Number of tags we move between the percpu freelist and the global freelist at
> > + * a time
> 
> "between a percpu freelist" would be more accurate?

No, because when we're stealing tags we always grab all of the remote
percpu freelist's tags - IDA_PCPU_BATCH_MOVE is only used when moving
to/from the global freelist.

> 
> > + */
> > +#define IDA_PCPU_BATCH_MOVE	32U
> > +
> > +/* Max size of percpu freelist, */
> > +#define IDA_PCPU_SIZE		((IDA_PCPU_BATCH_MOVE * 3) / 2)
> > +
> > +struct percpu_ida_cpu {
> > +	spinlock_t			lock;
> > +	unsigned			nr_free;
> > +	unsigned			freelist[];
> > +};
> 
> Data structure needs documentation.  There's one of these per cpu.  I
> guess nr_free and freelist are clear enough.  The presence of a lock
> in a percpu data structure is a surprise.  It's for cross-cpu stealing,
> I assume?

Yeah, I'll add some comments.

> > +static inline void alloc_global_tags(struct percpu_ida *pool,
> > +				     struct percpu_ida_cpu *tags)
> > +{
> > +	move_tags(tags->freelist, &tags->nr_free,
> > +		  pool->freelist, &pool->nr_free,
> > +		  min(pool->nr_free, IDA_PCPU_BATCH_MOVE));
> > +}
> 
> Document this function?

Will do

> > +	while (1) {
> > +		spin_lock(&pool->lock);
> > +
> > +		/*
> > +		 * prepare_to_wait() must come before steal_tags(), in case
> > +		 * percpu_ida_free() on another cpu flips a bit in
> > +		 * cpus_have_tags
> > +		 *
> > +		 * global lock held and irqs disabled, don't need percpu lock
> > +		 */
> > +		prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
> > +
> > +		if (!tags->nr_free)
> > +			alloc_global_tags(pool, tags);
> > +		if (!tags->nr_free)
> > +			steal_tags(pool, tags);
> > +
> > +		if (tags->nr_free) {
> > +			tag = tags->freelist[--tags->nr_free];
> > +			if (tags->nr_free)
> > +				set_bit(smp_processor_id(),
> > +					pool->cpus_have_tags);
> > +		}
> > +
> > +		spin_unlock(&pool->lock);
> > +		local_irq_restore(flags);
> > +
> > +		if (tag >= 0 || !(gfp & __GFP_WAIT))
> > +			break;
> > +
> > +		schedule();
> > +
> > +		local_irq_save(flags);
> > +		tags = this_cpu_ptr(pool->tag_cpu);
> > +	}
> 
> What guarantees that this wait will terminate?

It seems fairly clear to me from the break statement a couple lines up;
if we were passed __GFP_WAIT we terminate iff we successfully allocated a
tag. If we weren't passed __GFP_WAIT we never actually sleep.

I can add a comment if you think it needs one.

> > +	finish_wait(&pool->wait, &wait);
> > +	return tag;
> > +}
> > +EXPORT_SYMBOL_GPL(percpu_ida_alloc);
> > +
> > +/**
> > + * percpu_ida_free - free a tag
> > + * @pool: pool @tag was allocated from
> > + * @tag: a tag previously allocated with percpu_ida_alloc()
> > + *
> > + * Safe to be called from interrupt context.
> > + */
> > +void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
> > +{
> > +	struct percpu_ida_cpu *tags;
> > +	unsigned long flags;
> > +	unsigned nr_free;
> > +
> > +	BUG_ON(tag >= pool->nr_tags);
> > +
> > +	local_irq_save(flags);
> > +	tags = this_cpu_ptr(pool->tag_cpu);
> > +
> > +	spin_lock(&tags->lock);
> 
> Why do we need this lock, btw?  It's a cpu-local structure and local
> irqs are disabled...

Tag stealing. I added a comment for the data structure explaining the
lock; do you think that suffices?

> > +	/* Guard against overflow */
> > +	if (nr_tags > (unsigned) INT_MAX + 1) {
> > +		pr_err("tags.c: nr_tags too large\n");
> 
> "tags.c"?

Whoops, out of date.
Andrew Morton Aug. 28, 2013, 8:23 p.m. UTC | #6
On Wed, 28 Aug 2013 12:53:17 -0700 Kent Overstreet <kmo@daterainc.com> wrote:

> > > +	while (1) {
> > > +		spin_lock(&pool->lock);
> > > +
> > > +		/*
> > > +		 * prepare_to_wait() must come before steal_tags(), in case
> > > +		 * percpu_ida_free() on another cpu flips a bit in
> > > +		 * cpus_have_tags
> > > +		 *
> > > +		 * global lock held and irqs disabled, don't need percpu lock
> > > +		 */
> > > +		prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
> > > +
> > > +		if (!tags->nr_free)
> > > +			alloc_global_tags(pool, tags);
> > > +		if (!tags->nr_free)
> > > +			steal_tags(pool, tags);
> > > +
> > > +		if (tags->nr_free) {
> > > +			tag = tags->freelist[--tags->nr_free];
> > > +			if (tags->nr_free)
> > > +				set_bit(smp_processor_id(),
> > > +					pool->cpus_have_tags);
> > > +		}
> > > +
> > > +		spin_unlock(&pool->lock);
> > > +		local_irq_restore(flags);
> > > +
> > > +		if (tag >= 0 || !(gfp & __GFP_WAIT))
> > > +			break;
> > > +
> > > +		schedule();
> > > +
> > > +		local_irq_save(flags);
> > > +		tags = this_cpu_ptr(pool->tag_cpu);
> > > +	}
> > 
> > What guarantees that this wait will terminate?
> 
> It seems fairly clear to me from the break statement a couple lines up;
> > if we were passed __GFP_WAIT we terminate iff we successfully allocated a
> tag. If we weren't passed __GFP_WAIT we never actually sleep.

OK ;)  Let me rephrase.  What guarantees that a tag will become available?

If what we have here is an open-coded __GFP_NOFAIL then that is
potentially problematic.
Kent Overstreet Aug. 28, 2013, 8:44 p.m. UTC | #7
On Wed, Aug 28, 2013 at 01:23:32PM -0700, Andrew Morton wrote:
> On Wed, 28 Aug 2013 12:53:17 -0700 Kent Overstreet <kmo@daterainc.com> wrote:
> 
> > > > +	while (1) {
> > > > +		spin_lock(&pool->lock);
> > > > +
> > > > +		/*
> > > > +		 * prepare_to_wait() must come before steal_tags(), in case
> > > > +		 * percpu_ida_free() on another cpu flips a bit in
> > > > +		 * cpus_have_tags
> > > > +		 *
> > > > +		 * global lock held and irqs disabled, don't need percpu lock
> > > > +		 */
> > > > +		prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
> > > > +
> > > > +		if (!tags->nr_free)
> > > > +			alloc_global_tags(pool, tags);
> > > > +		if (!tags->nr_free)
> > > > +			steal_tags(pool, tags);
> > > > +
> > > > +		if (tags->nr_free) {
> > > > +			tag = tags->freelist[--tags->nr_free];
> > > > +			if (tags->nr_free)
> > > > +				set_bit(smp_processor_id(),
> > > > +					pool->cpus_have_tags);
> > > > +		}
> > > > +
> > > > +		spin_unlock(&pool->lock);
> > > > +		local_irq_restore(flags);
> > > > +
> > > > +		if (tag >= 0 || !(gfp & __GFP_WAIT))
> > > > +			break;
> > > > +
> > > > +		schedule();
> > > > +
> > > > +		local_irq_save(flags);
> > > > +		tags = this_cpu_ptr(pool->tag_cpu);
> > > > +	}
> > > 
> > > What guarantees that this wait will terminate?
> > 
> > It seems fairly clear to me from the break statement a couple lines up;
> > > if we were passed __GFP_WAIT we terminate iff we successfully allocated a
> > tag. If we weren't passed __GFP_WAIT we never actually sleep.
> 
> OK ;)  Let me rephrase.  What guarantees that a tag will become available?
> 
> If what we have here is an open-coded __GFP_NOFAIL then that is
> potentially problematic.

It's the same semantics as a mempool, really - it'll succeed when a tag
gets freed.  If we are sleeping then there isn't really anything else we
can do; there isn't anything we're trying in the __GFP_WAIT case that
we're not trying in the GFP_NOWAIT case.
Andrew Morton Aug. 28, 2013, 8:50 p.m. UTC | #8
On Wed, 28 Aug 2013 13:44:54 -0700 Kent Overstreet <kmo@daterainc.com> wrote:

> > > > What guarantees that this wait will terminate?
> > > 
> > > It seems fairly clear to me from the break statement a couple lines up;
> > > if we were passed __GFP_WAIT we terminate iff we successfully allocated a
> > > tag. If we weren't passed __GFP_WAIT we never actually sleep.
> > 
> > OK ;)  Let me rephrase.  What guarantees that a tag will become available?
> > 
> > If what we have here is an open-coded __GFP_NOFAIL then that is
> > potentially problematic.
> 
> It's the same semantics as a mempool, really - it'll succeed when a tag
> gets freed.

OK, that's reasonable if the code is being used to generate IO tags -
we expect the in-flight tags to eventually be returned.

But if a client of this code is using the allocator for something
totally different, there is no guarantee that the act of waiting will
result in any tags being returned.

(These are core design principles/constraints which should be
explicitly documented in a place where future readers will see them!)

Patch

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 871a213..f0db12b 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -16,6 +16,8 @@ 
 #include <linux/bitops.h>
 #include <linux/init.h>
 #include <linux/rcupdate.h>
+#include <linux/spinlock_types.h>
+#include <linux/wait.h>
 
 /*
  * We want shallower trees and thus more bits covered at each layer.  8
@@ -243,4 +245,55 @@  static inline int ida_get_new(struct ida *ida, int *p_id)
 
 void __init idr_init_cache(void);
 
+/* Percpu IDA/tag allocator */
+
+struct percpu_ida_cpu;
+
+struct percpu_ida {
+	/*
+	 * number of tags available to be allocated, as passed to
+	 * percpu_ida_init()
+	 */
+	unsigned			nr_tags;
+
+	struct percpu_ida_cpu __percpu	*tag_cpu;
+
+	/*
+	 * Bitmap of cpus that (may) have tags on their percpu freelists:
+	 * steal_tags() uses this to decide when to steal tags, and which cpus
+	 * to try stealing from.
+	 *
+	 * It's ok for a freelist to be empty when its bit is set - steal_tags()
+	 * will just keep looking - but the bitmap _must_ be set whenever a
+	 * percpu freelist does have tags.
+	 */
+	unsigned long			*cpus_have_tags;
+
+	struct {
+		spinlock_t		lock;
+		/*
+		 * When we go to steal tags from another cpu (see steal_tags()),
+		 * we want to pick a cpu at random. Cycling through them every
+		 * time we steal is a bit easier and more or less equivalent:
+		 */
+		unsigned		cpu_last_stolen;
+
+		/* For sleeping on allocation failure */
+		wait_queue_head_t	wait;
+
+		/*
+		 * Global freelist - it's a stack where nr_free points to the
+		 * top
+		 */
+		unsigned		nr_free;
+		unsigned		*freelist;
+	} ____cacheline_aligned_in_smp;
+};
+
+int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp);
+void percpu_ida_free(struct percpu_ida *pool, unsigned tag);
+
+void percpu_ida_destroy(struct percpu_ida *pool);
+int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags);
+
 #endif /* __IDR_H__ */
diff --git a/lib/idr.c b/lib/idr.c
index bfe4db4..57bfabe 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -26,17 +26,20 @@ 
  * with the slab allocator.
  */
 
-#ifndef TEST                        // to test in user space...
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#endif
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
 #include <linux/err.h>
-#include <linux/string.h>
+#include <linux/export.h>
+#include <linux/hardirq.h>
 #include <linux/idr.h>
-#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
 #include <linux/percpu.h>
-#include <linux/hardirq.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
 
 #define MAX_IDR_SHIFT		(sizeof(int) * 8 - 1)
 #define MAX_IDR_BIT		(1U << MAX_IDR_SHIFT)
@@ -1159,3 +1162,300 @@  void ida_init(struct ida *ida)
 
 }
 EXPORT_SYMBOL(ida_init);
+
+/* Percpu IDA */
+
+/*
+ * Number of tags we move between the percpu freelist and the global freelist at
+ * a time
+ */
+#define IDA_PCPU_BATCH_MOVE	32U
+
+/* Max size of percpu freelist, */
+#define IDA_PCPU_SIZE		((IDA_PCPU_BATCH_MOVE * 3) / 2)
+
+struct percpu_ida_cpu {
+	spinlock_t			lock;
+	unsigned			nr_free;
+	unsigned			freelist[];
+};
+
+static inline void move_tags(unsigned *dst, unsigned *dst_nr,
+			     unsigned *src, unsigned *src_nr,
+			     unsigned nr)
+{
+	*src_nr -= nr;
+	memcpy(dst + *dst_nr, src + *src_nr, sizeof(unsigned) * nr);
+	*dst_nr += nr;
+}
+
+/*
+ * Try to steal tags from a remote cpu's percpu freelist.
+ *
+ * We first check how many percpu freelists have tags - we don't steal tags
+ * unless enough percpu freelists have tags on them that it's possible more than
+ * half the total tags could be stuck on remote percpu freelists.
+ *
+ * Then we iterate through the cpus until we find some tags - we don't attempt
+ * to find the "best" cpu to steal from, to keep cacheline bouncing to a
+ * minimum.
+ */
+static inline void steal_tags(struct percpu_ida *pool,
+			      struct percpu_ida_cpu *tags)
+{
+	unsigned cpus_have_tags, cpu = pool->cpu_last_stolen;
+	struct percpu_ida_cpu *remote;
+
+	for (cpus_have_tags = bitmap_weight(pool->cpus_have_tags, nr_cpu_ids);
+	     cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2;
+	     cpus_have_tags--) {
+		cpu = find_next_bit(pool->cpus_have_tags, nr_cpu_ids, cpu);
+
+		if (cpu == nr_cpu_ids)
+			cpu = find_first_bit(pool->cpus_have_tags, nr_cpu_ids);
+
+		if (cpu == nr_cpu_ids)
+			BUG();
+
+		pool->cpu_last_stolen = cpu;
+		remote = per_cpu_ptr(pool->tag_cpu, cpu);
+
+		clear_bit(cpu, pool->cpus_have_tags);
+
+		if (remote == tags)
+			continue;
+
+		spin_lock(&remote->lock);
+
+		if (remote->nr_free) {
+			memcpy(tags->freelist,
+			       remote->freelist,
+			       sizeof(unsigned) * remote->nr_free);
+
+			tags->nr_free = remote->nr_free;
+			remote->nr_free = 0;
+		}
+
+		spin_unlock(&remote->lock);
+
+		if (tags->nr_free)
+			break;
+	}
+}
+
+static inline void alloc_global_tags(struct percpu_ida *pool,
+				     struct percpu_ida_cpu *tags)
+{
+	move_tags(tags->freelist, &tags->nr_free,
+		  pool->freelist, &pool->nr_free,
+		  min(pool->nr_free, IDA_PCPU_BATCH_MOVE));
+}
+
+static inline unsigned alloc_local_tag(struct percpu_ida *pool,
+				       struct percpu_ida_cpu *tags)
+{
+	int tag = -ENOSPC;
+
+	spin_lock(&tags->lock);
+	if (tags->nr_free)
+		tag = tags->freelist[--tags->nr_free];
+	spin_unlock(&tags->lock);
+
+	return tag;
+}
+
+/**
+ * percpu_ida_alloc - allocate a tag
+ * @pool: pool to allocate from
+ * @gfp: gfp flags
+ *
+ * Returns a tag - an integer in the range [0..nr_tags) (passed to
+ * tag_pool_init()), or otherwise -ENOSPC on allocation failure.
+ *
+ * Safe to be called from interrupt context (assuming it isn't passed
+ * __GFP_WAIT, of course).
+ *
+ * Will not fail if passed __GFP_WAIT.
+ */
+int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp)
+{
+	DEFINE_WAIT(wait);
+	struct percpu_ida_cpu *tags;
+	unsigned long flags;
+	int tag;
+
+	local_irq_save(flags);
+	tags = this_cpu_ptr(pool->tag_cpu);
+
+	/* Fastpath */
+	tag = alloc_local_tag(pool, tags);
+	if (likely(tag >= 0)) {
+		local_irq_restore(flags);
+		return tag;
+	}
+
+	while (1) {
+		spin_lock(&pool->lock);
+
+		/*
+		 * prepare_to_wait() must come before steal_tags(), in case
+		 * percpu_ida_free() on another cpu flips a bit in
+		 * cpus_have_tags
+		 *
+		 * global lock held and irqs disabled, don't need percpu lock
+		 */
+		prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+		if (!tags->nr_free)
+			alloc_global_tags(pool, tags);
+		if (!tags->nr_free)
+			steal_tags(pool, tags);
+
+		if (tags->nr_free) {
+			tag = tags->freelist[--tags->nr_free];
+			if (tags->nr_free)
+				set_bit(smp_processor_id(),
+					pool->cpus_have_tags);
+		}
+
+		spin_unlock(&pool->lock);
+		local_irq_restore(flags);
+
+		if (tag >= 0 || !(gfp & __GFP_WAIT))
+			break;
+
+		schedule();
+
+		local_irq_save(flags);
+		tags = this_cpu_ptr(pool->tag_cpu);
+	}
+
+	finish_wait(&pool->wait, &wait);
+	return tag;
+}
+EXPORT_SYMBOL_GPL(percpu_ida_alloc);
+
+/**
+ * percpu_ida_free - free a tag
+ * @pool: pool @tag was allocated from
+ * @tag: a tag previously allocated with percpu_ida_alloc()
+ *
+ * Safe to be called from interrupt context.
+ */
+void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
+{
+	struct percpu_ida_cpu *tags;
+	unsigned long flags;
+	unsigned nr_free;
+
+	BUG_ON(tag >= pool->nr_tags);
+
+	local_irq_save(flags);
+	tags = this_cpu_ptr(pool->tag_cpu);
+
+	spin_lock(&tags->lock);
+	tags->freelist[tags->nr_free++] = tag;
+
+	nr_free = tags->nr_free;
+	spin_unlock(&tags->lock);
+
+	if (nr_free == 1) {
+		set_bit(smp_processor_id(),
+			pool->cpus_have_tags);
+		wake_up(&pool->wait);
+	}
+
+	if (nr_free == IDA_PCPU_SIZE) {
+		spin_lock(&pool->lock);
+
+		/*
+		 * Global lock held and irqs disabled, don't need percpu
+		 * lock
+		 */
+		if (tags->nr_free == IDA_PCPU_SIZE) {
+			move_tags(pool->freelist, &pool->nr_free,
+				  tags->freelist, &tags->nr_free,
+				  IDA_PCPU_BATCH_MOVE);
+
+			wake_up(&pool->wait);
+		}
+		spin_unlock(&pool->lock);
+	}
+
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(percpu_ida_free);
+
+/**
+ * percpu_ida_destroy - release a tag pool's resources
+ * @pool: pool to free
+ *
+ * Frees the resources allocated by percpu_ida_init().
+ */
+void percpu_ida_destroy(struct percpu_ida *pool)
+{
+	free_percpu(pool->tag_cpu);
+	kfree(pool->cpus_have_tags);
+	free_pages((unsigned long) pool->freelist,
+		   get_order(pool->nr_tags * sizeof(unsigned)));
+}
+EXPORT_SYMBOL_GPL(percpu_ida_destroy);
+
+/**
+ * percpu_ida_init - initialize a percpu tag pool
+ * @pool: pool to initialize
+ * @nr_tags: number of tags that will be available for allocation
+ *
+ * Initializes @pool so that it can be used to allocate tags - integers in the
+ * range [0, nr_tags). Typically, they'll be used by driver code to refer to a
+ * preallocated array of tag structures.
+ *
+ * Allocation is percpu, but sharding is limited by nr_tags - for best
+ * performance, the workload should not span more cpus than nr_tags / 128.
+ */
+int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
+{
+	unsigned i, cpu, order;
+
+	memset(pool, 0, sizeof(*pool));
+
+	init_waitqueue_head(&pool->wait);
+	spin_lock_init(&pool->lock);
+	pool->nr_tags = nr_tags;
+
+	/* Guard against overflow */
+	if (nr_tags > (unsigned) INT_MAX + 1) {
+		pr_err("tags.c: nr_tags too large\n");
+		return -EINVAL;
+	}
+
+	order = get_order(nr_tags * sizeof(unsigned));
+	pool->freelist = (void *) __get_free_pages(GFP_KERNEL, order);
+	if (!pool->freelist)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_tags; i++)
+		pool->freelist[i] = i;
+
+	pool->nr_free = nr_tags;
+
+	pool->cpus_have_tags = kzalloc(BITS_TO_LONGS(nr_cpu_ids) *
+				       sizeof(unsigned long), GFP_KERNEL);
+	if (!pool->cpus_have_tags)
+		goto err;
+
+	pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +
+				       IDA_PCPU_SIZE * sizeof(unsigned),
+				       sizeof(unsigned));
+	if (!pool->tag_cpu)
+		goto err;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu_ptr(pool->tag_cpu, cpu)->lock);
+
+	return 0;
+err:
+	percpu_ida_destroy(pool);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(percpu_ida_init);