
[09/24] rcu/tree: cache specified number of objects

Message ID 20200428205903.61704-10-urezki@gmail.com (mailing list archive)
State New, archived
Series: Introduce kvfree_rcu(1 or 2 arguments)

Commit Message

Uladzislau Rezki April 28, 2020, 8:58 p.m. UTC
Cache some extra objects per CPU. During the reclaim process some
pages are cached instead of being released, by linking them into a
list. Such an approach provides O(1) access time to the cache.

This reduces the number of requests to the page allocator and also
helps when a low-memory condition occurs.

A parameter reflecting the minimum number of pages allowed to be
cached per CPU is exposed via sysfs; it is read-only and is named
"rcu_min_cached_objs".

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 kernel/rcu/tree.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 60 insertions(+), 4 deletions(-)
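For context, a minimal sketch of a typical kfree_rcu() caller whose freed
objects end up in the bulk pages that this per-CPU cache backs (illustrative
only, not part of the patch; the structure and field names are made up):

    struct foo {
        int data;
        struct rcu_head rh;
    };

    static void release_foo(struct foo *fp)
    {
        /* Queued per-CPU; the bulk pages holding such pointers are
         * what get_cached_bnode()/put_cached_bnode() recycle. */
        kfree_rcu(fp, rh);
    }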

Comments

Paul E. McKenney May 1, 2020, 9:27 p.m. UTC | #1
On Tue, Apr 28, 2020 at 10:58:48PM +0200, Uladzislau Rezki (Sony) wrote:
> Cache some extra objects per-CPU. During reclaim process
> some pages are cached instead of releasing by linking them
> into the list. Such approach provides O(1) access time to
> the cache.
> 
> That reduces number of requests to the page allocator, also
> that makes it more helpful if a low memory condition occurs.
> 
> A parameter reflecting the minimum allowed pages to be
> cached per one CPU is propagated via sysfs, it is read
> only, the name is "rcu_min_cached_objs".
> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> ---
>  kernel/rcu/tree.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 60 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 89e9ca3f4e3e..d8975819b1c9 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -178,6 +178,14 @@ module_param(gp_init_delay, int, 0444);
>  static int gp_cleanup_delay;
>  module_param(gp_cleanup_delay, int, 0444);
>  
> +/*
> + * This rcu parameter is read-only, but can be write also.

You mean that although the parameter is read-only, you see no reason
why it could not be converted to writeable?

If it were writeable, and a given CPU had the maximum number of cached
objects, the rcu_min_cached_objs value was decreased, but that CPU never
saw another kfree_rcu(), would the number of cached objects change?

(Just curious, not asking for a change in functionality.)

> + * It reflects the minimum allowed number of objects which
> + * can be cached per-CPU. Object size is equal to one page.
> + */
> +int rcu_min_cached_objs = 2;
> +module_param(rcu_min_cached_objs, int, 0444);
> +
>  /* Retrieve RCU kthreads priority for rcutorture */
>  int rcu_get_gp_kthreads_prio(void)
>  {
> @@ -2887,7 +2895,6 @@ struct kfree_rcu_cpu_work {
>   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
>   * @head: List of kfree_rcu() objects not yet waiting for a grace period
>   * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
> - * @bcached: Keeps at most one object for later reuse when build chain blocks
>   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
>   * @lock: Synchronize access to this structure
>   * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
> @@ -2902,7 +2909,6 @@ struct kfree_rcu_cpu_work {
>  struct kfree_rcu_cpu {
>  	struct rcu_head *head;
>  	struct kfree_rcu_bulk_data *bhead;
> -	struct kfree_rcu_bulk_data *bcached;
>  	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
>  	raw_spinlock_t lock;
>  	struct delayed_work monitor_work;
> @@ -2910,6 +2916,15 @@ struct kfree_rcu_cpu {
>  	bool initialized;
>  	// Number of objects for which GP not started
>  	int count;
> +
> +	/*
> +	 * Number of cached objects which are queued into
> +	 * the lock-less list. This cache is used by the
> +	 * kvfree_call_rcu() function and as of now its
> +	 * size is static.
> +	 */
> +	struct llist_head bkvcache;
> +	int nr_bkv_objs;
>  };
>  
>  static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
> @@ -2946,6 +2961,31 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
>  	local_irq_restore(flags);
>  }
>  
> +static inline struct kfree_rcu_bulk_data *
> +get_cached_bnode(struct kfree_rcu_cpu *krcp)
> +{
> +	if (!krcp->nr_bkv_objs)
> +		return NULL;
> +
> +	krcp->nr_bkv_objs--;
> +	return (struct kfree_rcu_bulk_data *)
> +		llist_del_first(&krcp->bkvcache);
> +}
> +
> +static inline bool
> +put_cached_bnode(struct kfree_rcu_cpu *krcp,
> +	struct kfree_rcu_bulk_data *bnode)
> +{
> +	/* Check the limit. */
> +	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
> +		return false;
> +
> +	llist_add((struct llist_node *) bnode, &krcp->bkvcache);
> +	krcp->nr_bkv_objs++;
> +	return true;
> +
> +}
> +
>  /*
>   * This function is invoked in workqueue context after a grace period.
>   * It frees all the objects queued on ->bhead_free or ->head_free.
> @@ -2981,7 +3021,12 @@ static void kfree_rcu_work(struct work_struct *work)
>  		kfree_bulk(bhead->nr_records, bhead->records);
>  		rcu_lock_release(&rcu_callback_map);
>  
> -		if (cmpxchg(&krcp->bcached, NULL, bhead))
> +		krcp = krc_this_cpu_lock(&flags);

Presumably the list can also be accessed without holding this lock,
because otherwise we shouldn't need llist...

							Thanx, Paul

> +		if (put_cached_bnode(krcp, bhead))
> +			bhead = NULL;
> +		krc_this_cpu_unlock(krcp, flags);
> +
> +		if (bhead)
>  			free_page((unsigned long) bhead);
>  
>  		cond_resched_tasks_rcu_qs();
> @@ -3114,7 +3159,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
>  	/* Check if a new block is required. */
>  	if (!krcp->bhead ||
>  			krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
> -		bnode = xchg(&krcp->bcached, NULL);
> +		bnode = get_cached_bnode(krcp);
>  		if (!bnode) {
>  			WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
>  
> @@ -4167,12 +4212,23 @@ static void __init kfree_rcu_batch_init(void)
>  
>  	for_each_possible_cpu(cpu) {
>  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> +		struct kfree_rcu_bulk_data *bnode;
>  
>  		for (i = 0; i < KFREE_N_BATCHES; i++) {
>  			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
>  			krcp->krw_arr[i].krcp = krcp;
>  		}
>  
> +		for (i = 0; i < rcu_min_cached_objs; i++) {
> +			bnode = (struct kfree_rcu_bulk_data *)
> +				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> +
> +			if (bnode)
> +				put_cached_bnode(krcp, bnode);
> +			else
> +				pr_err("Failed to preallocate for %d CPU!\n", cpu);
> +		}
> +
>  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
>  		krcp->initialized = true;
>  	}
> -- 
> 2.20.1
>
Uladzislau Rezki May 4, 2020, 12:43 p.m. UTC | #2
On Fri, May 01, 2020 at 02:27:49PM -0700, Paul E. McKenney wrote:
> On Tue, Apr 28, 2020 at 10:58:48PM +0200, Uladzislau Rezki (Sony) wrote:
> > Cache some extra objects per-CPU. During reclaim process
> > some pages are cached instead of releasing by linking them
> > into the list. Such approach provides O(1) access time to
> > the cache.
> > 
> > That reduces number of requests to the page allocator, also
> > that makes it more helpful if a low memory condition occurs.
> > 
> > A parameter reflecting the minimum allowed pages to be
> > cached per one CPU is propagated via sysfs, it is read
> > only, the name is "rcu_min_cached_objs".
> > 
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > ---
> >  kernel/rcu/tree.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 60 insertions(+), 4 deletions(-)
> > 
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 89e9ca3f4e3e..d8975819b1c9 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -178,6 +178,14 @@ module_param(gp_init_delay, int, 0444);
> >  static int gp_cleanup_delay;
> >  module_param(gp_cleanup_delay, int, 0444);
> >  
> > +/*
> > + * This rcu parameter is read-only, but can be write also.
> 
> You mean that although the parameter is read-only, you see no reason
> why it could not be converted to writeable?
> 
I just added a note. If it were writable, then we could change the size of the
per-CPU cache dynamically, i.e. "echo 5 > /sys/.../rcu_min_cached_objs"
would cache 5 pages. But I do not have a strong opinion on whether it should be
writable.

>
> > If it were writeable, and a given CPU had the maximum number of cached
> objects, the rcu_min_cached_objs value was decreased, but that CPU never
> saw another kfree_rcu(), would the number of cached objects change?
> 
No. It works this way: kfree_rcu() dequeues a page from the cache, whereas
the "rcu work" puts it back if the number of objects is < rcu_min_cached_objs,
and frees the page if it is >=.
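In code terms, that lifecycle maps onto the patch roughly as follows
(condensed sketch of the two paths above; locking and error handling elided):

    /* kfree_rcu() enqueue path: take a page from the per-CPU cache, O(1). */
    bnode = get_cached_bnode(krcp);     /* NULL when the cache is empty */
    if (!bnode)
        bnode = (struct kfree_rcu_bulk_data *)
            __get_free_page(GFP_NOWAIT | __GFP_NOWARN);

    /* "rcu work" path, after the grace period: try to put the page back. */
    if (!put_cached_bnode(krcp, bhead)) /* refuses once nr_bkv_objs >= rcu_min_cached_objs */
        free_page((unsigned long) bhead);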

>
> (Just curious, not asking for a change in functionality.)
> 
> > + * It reflects the minimum allowed number of objects which
> > + * can be cached per-CPU. Object size is equal to one page.
> > + */
> > +int rcu_min_cached_objs = 2;
> > +module_param(rcu_min_cached_objs, int, 0444);
> > +
> >  /* Retrieve RCU kthreads priority for rcutorture */
> >  int rcu_get_gp_kthreads_prio(void)
> >  {
> > @@ -2887,7 +2895,6 @@ struct kfree_rcu_cpu_work {
> >   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> >   * @head: List of kfree_rcu() objects not yet waiting for a grace period
> >   * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
> > - * @bcached: Keeps at most one object for later reuse when build chain blocks
> >   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> >   * @lock: Synchronize access to this structure
> >   * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
> > @@ -2902,7 +2909,6 @@ struct kfree_rcu_cpu_work {
> >  struct kfree_rcu_cpu {
> >  	struct rcu_head *head;
> >  	struct kfree_rcu_bulk_data *bhead;
> > -	struct kfree_rcu_bulk_data *bcached;
> >  	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> >  	raw_spinlock_t lock;
> >  	struct delayed_work monitor_work;
> > @@ -2910,6 +2916,15 @@ struct kfree_rcu_cpu {
> >  	bool initialized;
> >  	// Number of objects for which GP not started
> >  	int count;
> > +
> > +	/*
> > +	 * Number of cached objects which are queued into
> > +	 * the lock-less list. This cache is used by the
> > +	 * kvfree_call_rcu() function and as of now its
> > +	 * size is static.
> > +	 */
> > +	struct llist_head bkvcache;
> > +	int nr_bkv_objs;
> >  };
> >  
> >  static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
> > @@ -2946,6 +2961,31 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
> >  	local_irq_restore(flags);
> >  }
> >  
> > +static inline struct kfree_rcu_bulk_data *
> > +get_cached_bnode(struct kfree_rcu_cpu *krcp)
> > +{
> > +	if (!krcp->nr_bkv_objs)
> > +		return NULL;
> > +
> > +	krcp->nr_bkv_objs--;
> > +	return (struct kfree_rcu_bulk_data *)
> > +		llist_del_first(&krcp->bkvcache);
> > +}
> > +
> > +static inline bool
> > +put_cached_bnode(struct kfree_rcu_cpu *krcp,
> > +	struct kfree_rcu_bulk_data *bnode)
> > +{
> > +	/* Check the limit. */
> > +	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
> > +		return false;
> > +
> > +	llist_add((struct llist_node *) bnode, &krcp->bkvcache);
> > +	krcp->nr_bkv_objs++;
> > +	return true;
> > +
> > +}
> > +
> >  /*
> >   * This function is invoked in workqueue context after a grace period.
> >   * It frees all the objects queued on ->bhead_free or ->head_free.
> > @@ -2981,7 +3021,12 @@ static void kfree_rcu_work(struct work_struct *work)
> >  		kfree_bulk(bhead->nr_records, bhead->records);
> >  		rcu_lock_release(&rcu_callback_map);
> >  
> > -		if (cmpxchg(&krcp->bcached, NULL, bhead))
> > +		krcp = krc_this_cpu_lock(&flags);
> 
> Presumably the list can also be accessed without holding this lock,
> because otherwise we shouldn't need llist...
> 
Hm... We increase the number of elements in the cache, therefore it is not
lockless. On the other hand, I used llist_head to maintain the cache
because it is a singly linked list, so we do not need a "*prev" link. Also
we do not need to init the list.

But I can change it to list_head. Please let me know if I should :)
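For illustration, the difference being weighed here, with both variants
assumed to run under krcp->lock (the list_head field and member names below
are hypothetical, not from the patch):

    /* llist_head variant (as in the patch): one pointer per node, no init needed. */
    llist_add((struct llist_node *) bnode, &krcp->bkvcache);
    bnode = (struct kfree_rcu_bulk_data *) llist_del_first(&krcp->bkvcache);

    /* list_head variant: doubly linked, needs explicit init and a prev pointer. */
    INIT_LIST_HEAD(&krcp->bkvcache_list);               /* hypothetical field, init once */
    list_add(&bnode->cache_node, &krcp->bkvcache_list); /* hypothetical member */
    bnode = list_first_entry_or_null(&krcp->bkvcache_list,
                                     struct kfree_rcu_bulk_data, cache_node);
    if (bnode)
        list_del(&bnode->cache_node);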

--
Vlad Rezki
Paul E. McKenney May 4, 2020, 3:24 p.m. UTC | #3
On Mon, May 04, 2020 at 02:43:23PM +0200, Uladzislau Rezki wrote:
> On Fri, May 01, 2020 at 02:27:49PM -0700, Paul E. McKenney wrote:
> > On Tue, Apr 28, 2020 at 10:58:48PM +0200, Uladzislau Rezki (Sony) wrote:
> > > Cache some extra objects per-CPU. During reclaim process
> > > some pages are cached instead of releasing by linking them
> > > into the list. Such approach provides O(1) access time to
> > > the cache.
> > > 
> > > That reduces number of requests to the page allocator, also
> > > that makes it more helpful if a low memory condition occurs.
> > > 
> > > A parameter reflecting the minimum allowed pages to be
> > > cached per one CPU is propagated via sysfs, it is read
> > > only, the name is "rcu_min_cached_objs".
> > > 
> > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > ---
> > >  kernel/rcu/tree.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---
> > >  1 file changed, 60 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index 89e9ca3f4e3e..d8975819b1c9 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -178,6 +178,14 @@ module_param(gp_init_delay, int, 0444);
> > >  static int gp_cleanup_delay;
> > >  module_param(gp_cleanup_delay, int, 0444);
> > >  
> > > +/*
> > > + * This rcu parameter is read-only, but can be write also.
> > 
> > You mean that although the parameter is read-only, you see no reason
> > why it could not be converted to writeable?
> > 
> I added just a note. If it is writable, then we can change the size of the
> per-CPU cache dynamically, i.e. "echo 5 > /sys/.../rcu_min_cached_objs"
> would cache 5 pages. But i do not have a strong opinion if it should be
> writable.
> 
> > > If it were writeable, and a given CPU had the maximum number of cached
> > objects, the rcu_min_cached_objs value was decreased, but that CPU never
> > saw another kfree_rcu(), would the number of cached objects change?
> > 
> No. It works the way: unqueue the page from cache in the kfree_rcu(),
> whereas "rcu work" will put it back if number of objects < rcu_min_cached_objs,
> if >= will free the page.

Just to make sure I understand...  If someone writes a smaller number to
the sysfs variable, the per-CPU caches will be decreased at that point,
immediately during that sysfs write?  Or are you saying something else?

> > (Just curious, not asking for a change in functionality.)
> > 
> > > + * It reflects the minimum allowed number of objects which
> > > + * can be cached per-CPU. Object size is equal to one page.
> > > + */
> > > +int rcu_min_cached_objs = 2;
> > > +module_param(rcu_min_cached_objs, int, 0444);
> > > +
> > >  /* Retrieve RCU kthreads priority for rcutorture */
> > >  int rcu_get_gp_kthreads_prio(void)
> > >  {
> > > @@ -2887,7 +2895,6 @@ struct kfree_rcu_cpu_work {
> > >   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
> > >   * @head: List of kfree_rcu() objects not yet waiting for a grace period
> > >   * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
> > > - * @bcached: Keeps at most one object for later reuse when build chain blocks
> > >   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
> > >   * @lock: Synchronize access to this structure
> > >   * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
> > > @@ -2902,7 +2909,6 @@ struct kfree_rcu_cpu_work {
> > >  struct kfree_rcu_cpu {
> > >  	struct rcu_head *head;
> > >  	struct kfree_rcu_bulk_data *bhead;
> > > -	struct kfree_rcu_bulk_data *bcached;
> > >  	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
> > >  	raw_spinlock_t lock;
> > >  	struct delayed_work monitor_work;
> > > @@ -2910,6 +2916,15 @@ struct kfree_rcu_cpu {
> > >  	bool initialized;
> > >  	// Number of objects for which GP not started
> > >  	int count;
> > > +
> > > +	/*
> > > +	 * Number of cached objects which are queued into
> > > +	 * the lock-less list. This cache is used by the
> > > +	 * kvfree_call_rcu() function and as of now its
> > > +	 * size is static.
> > > +	 */
> > > +	struct llist_head bkvcache;
> > > +	int nr_bkv_objs;
> > >  };
> > >  
> > >  static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
> > > @@ -2946,6 +2961,31 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
> > >  	local_irq_restore(flags);
> > >  }
> > >  
> > > +static inline struct kfree_rcu_bulk_data *
> > > +get_cached_bnode(struct kfree_rcu_cpu *krcp)
> > > +{
> > > +	if (!krcp->nr_bkv_objs)
> > > +		return NULL;
> > > +
> > > +	krcp->nr_bkv_objs--;
> > > +	return (struct kfree_rcu_bulk_data *)
> > > +		llist_del_first(&krcp->bkvcache);
> > > +}
> > > +
> > > +static inline bool
> > > +put_cached_bnode(struct kfree_rcu_cpu *krcp,
> > > +	struct kfree_rcu_bulk_data *bnode)
> > > +{
> > > +	/* Check the limit. */
> > > +	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
> > > +		return false;
> > > +
> > > +	llist_add((struct llist_node *) bnode, &krcp->bkvcache);
> > > +	krcp->nr_bkv_objs++;
> > > +	return true;
> > > +
> > > +}
> > > +
> > >  /*
> > >   * This function is invoked in workqueue context after a grace period.
> > >   * It frees all the objects queued on ->bhead_free or ->head_free.
> > > @@ -2981,7 +3021,12 @@ static void kfree_rcu_work(struct work_struct *work)
> > >  		kfree_bulk(bhead->nr_records, bhead->records);
> > >  		rcu_lock_release(&rcu_callback_map);
> > >  
> > > -		if (cmpxchg(&krcp->bcached, NULL, bhead))
> > > +		krcp = krc_this_cpu_lock(&flags);
> > 
> > Presumably the list can also be accessed without holding this lock,
> > because otherwise we shouldn't need llist...
> > 
> Hm... We increase the number of elements in cache, therefore it is not
> lockless. From the other hand i used llist_head to maintain the cache
> because it is single linked list, we do not need "*prev" link. Also
> we do not need to init the list.
> 
> But i can change it to list_head. Please let me know if i need :)

Hmmm...  Maybe it is time for a non-atomic singly linked list?  In the RCU
callback processing, the operations were open-coded, but they have been
pushed into include/linux/rcu_segcblist.h and kernel/rcu/rcu_segcblist.*.

Maybe some non-atomic/protected/whatever macros in the llist.h file?
Or maybe just open-code the singly linked list?  (Probably not the
best choice, though.)  Add comments stating that the atomic properties
of the llist functions aren't needed?  Something else?

The comments would be a good start.  Just to take pity on people seeing
the potential for concurrency and wondering how the concurrent accesses
actually happen.  ;-)
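As a sketch of the "non-atomic/protected macros" option, something along
these lines could sit next to llist.h (names invented here; this is not an
existing API, and callers would provide their own locking):

    static inline void __llist_add_protected(struct llist_node *new,
                                             struct llist_head *head)
    {
        new->next = head->first;
        head->first = new;
    }

    static inline struct llist_node *
    __llist_del_first_protected(struct llist_head *head)
    {
        struct llist_node *first = head->first;

        if (first)
            head->first = first->next;
        return first;
    }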

							Thanx, Paul
Uladzislau Rezki May 4, 2020, 5:48 p.m. UTC | #4
On Mon, May 04, 2020 at 08:24:37AM -0700, Paul E. McKenney wrote:
> On Mon, May 04, 2020 at 02:43:23PM +0200, Uladzislau Rezki wrote:
> > On Fri, May 01, 2020 at 02:27:49PM -0700, Paul E. McKenney wrote:
> > > On Tue, Apr 28, 2020 at 10:58:48PM +0200, Uladzislau Rezki (Sony) wrote:
> > > > Cache some extra objects per-CPU. During reclaim process
> > > > some pages are cached instead of releasing by linking them
> > > > into the list. Such approach provides O(1) access time to
> > > > the cache.
> > > > 
> > > > That reduces number of requests to the page allocator, also
> > > > that makes it more helpful if a low memory condition occurs.
> > > > 
> > > > A parameter reflecting the minimum allowed pages to be
> > > > cached per one CPU is propagated via sysfs, it is read
> > > > only, the name is "rcu_min_cached_objs".
> > > > 
> > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > ---
> > > >  kernel/rcu/tree.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---
> > > >  1 file changed, 60 insertions(+), 4 deletions(-)
> > > > 
> > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > index 89e9ca3f4e3e..d8975819b1c9 100644
> > > > --- a/kernel/rcu/tree.c
> > > > +++ b/kernel/rcu/tree.c
> > > > @@ -178,6 +178,14 @@ module_param(gp_init_delay, int, 0444);
> > > >  static int gp_cleanup_delay;
> > > >  module_param(gp_cleanup_delay, int, 0444);
> > > >  
> > > > +/*
> > > > + * This rcu parameter is read-only, but can be write also.
> > > 
> > > You mean that although the parameter is read-only, you see no reason
> > > why it could not be converted to writeable?
> > > 
> > I added just a note. If it is writable, then we can change the size of the
> > per-CPU cache dynamically, i.e. "echo 5 > /sys/.../rcu_min_cached_objs"
> > would cache 5 pages. But i do not have a strong opinion if it should be
> > writable.
> > 
> > > > If it were writeable, and a given CPU had the maximum number of cached
> > > objects, the rcu_min_cached_objs value was decreased, but that CPU never
> > > saw another kfree_rcu(), would the number of cached objects change?
> > > 
> > No. It works the way: unqueue the page from cache in the kfree_rcu(),
> > whereas "rcu work" will put it back if number of objects < rcu_min_cached_objs,
> > if >= will free the page.
> 
> Just to make sure I understand...  If someone writes a smaller number to
> the sysfs variable, the per-CPU caches will be decreased at that point,
> immediately during that sysfs write?  Or are you saying something else?
> 
This patch defines it as read-only. It defines the minimum threshold that
controls the number of elements in the per-CPU cache. If we decide to make it
writable as well, then we will have full freedom in how to define its behavior;
i.e. that behavior is not defined yet precisely because the parameter is read-only.
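If it were ever made writable, the mechanical part would just be the
permission bits of the module parameter, with the shrink/grow policy on
write still left to be defined (sketch only):

-module_param(rcu_min_cached_objs, int, 0444);
+module_param(rcu_min_cached_objs, int, 0644);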


> > > Presumably the list can also be accessed without holding this lock,
> > > because otherwise we shouldn't need llist...
> > > 
> > Hm... We increase the number of elements in cache, therefore it is not
> > lockless. From the other hand i used llist_head to maintain the cache
> > because it is single linked list, we do not need "*prev" link. Also
> > we do not need to init the list.
> > 
> > But i can change it to list_head. Please let me know if i need :)
> 
> Hmmm...  Maybe it is time for a non-atomic singly linked list?  In the RCU
> callback processing, the operations were open-coded, but they have been
> pushed into include/linux/rcu_segcblist.h and kernel/rcu/rcu_segcblist.*.
> 
> Maybe some non-atomic/protected/whatever macros in the llist.h file?
> Or maybe just open-code the singly linked list?  (Probably not the
> best choice, though.)  Add comments stating that the atomic properties
> > of the llist functions aren't needed?  Something else?
>
In order to keep it simple, I can replace llist_head with list_head?

> 
> The comments would be a good start.  Just to take pity on people seeing
> the potential for concurrency and wondering how the concurrent accesses
> actually happen.  ;-)
> 
Sounds like you are kidding me :) 

--
Vlad Rezki
Paul E. McKenney May 4, 2020, 6:07 p.m. UTC | #5
On Mon, May 04, 2020 at 07:48:22PM +0200, Uladzislau Rezki wrote:
> On Mon, May 04, 2020 at 08:24:37AM -0700, Paul E. McKenney wrote:
> > On Mon, May 04, 2020 at 02:43:23PM +0200, Uladzislau Rezki wrote:
> > > On Fri, May 01, 2020 at 02:27:49PM -0700, Paul E. McKenney wrote:
> > > > On Tue, Apr 28, 2020 at 10:58:48PM +0200, Uladzislau Rezki (Sony) wrote:
> > > > > Cache some extra objects per-CPU. During reclaim process
> > > > > some pages are cached instead of releasing by linking them
> > > > > into the list. Such approach provides O(1) access time to
> > > > > the cache.
> > > > > 
> > > > > That reduces number of requests to the page allocator, also
> > > > > that makes it more helpful if a low memory condition occurs.
> > > > > 
> > > > > A parameter reflecting the minimum allowed pages to be
> > > > > cached per one CPU is propagated via sysfs, it is read
> > > > > only, the name is "rcu_min_cached_objs".
> > > > > 
> > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > > ---
> > > > >  kernel/rcu/tree.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---
> > > > >  1 file changed, 60 insertions(+), 4 deletions(-)
> > > > > 
> > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > > index 89e9ca3f4e3e..d8975819b1c9 100644
> > > > > --- a/kernel/rcu/tree.c
> > > > > +++ b/kernel/rcu/tree.c
> > > > > @@ -178,6 +178,14 @@ module_param(gp_init_delay, int, 0444);
> > > > >  static int gp_cleanup_delay;
> > > > >  module_param(gp_cleanup_delay, int, 0444);
> > > > >  
> > > > > +/*
> > > > > + * This rcu parameter is read-only, but can be write also.
> > > > 
> > > > You mean that although the parameter is read-only, you see no reason
> > > > why it could not be converted to writeable?
> > > > 
> > > I added just a note. If it is writable, then we can change the size of the
> > > per-CPU cache dynamically, i.e. "echo 5 > /sys/.../rcu_min_cached_objs"
> > > would cache 5 pages. But i do not have a strong opinion if it should be
> > > writable.
> > > 
> > > > > If it were writeable, and a given CPU had the maximum number of cached
> > > > objects, the rcu_min_cached_objs value was decreased, but that CPU never
> > > > saw another kfree_rcu(), would the number of cached objects change?
> > > > 
> > > No. It works the way: unqueue the page from cache in the kfree_rcu(),
> > > whereas "rcu work" will put it back if number of objects < rcu_min_cached_objs,
> > > if >= will free the page.
> > 
> > Just to make sure I understand...  If someone writes a smaller number to
> > the sysfs variable, the per-CPU caches will be decreased at that point,
> > immediately during that sysfs write?  Or are you saying something else?
> > 
> This patch defines it as read-only. It defines the minimum threshold that
> controls number of elements in the per-CPU cache. If we decide to make it
> write also, then we will have full of freedom how to define its behavior,
> i.e. it is not defined because it is read only.

And runtime-read-only sounds like an excellent state for it.

> > > > Presumably the list can also be accessed without holding this lock,
> > > > because otherwise we shouldn't need llist...
> > > > 
> > > Hm... We increase the number of elements in cache, therefore it is not
> > > lockless. From the other hand i used llist_head to maintain the cache
> > > because it is single linked list, we do not need "*prev" link. Also
> > > we do not need to init the list.
> > > 
> > > But i can change it to list_head. Please let me know if i need :)
> > 
> > Hmmm...  Maybe it is time for a non-atomic singly linked list?  In the RCU
> > callback processing, the operations were open-coded, but they have been
> > pushed into include/linux/rcu_segcblist.h and kernel/rcu/rcu_segcblist.*.
> > 
> > Maybe some non-atomic/protected/whatever macros in the llist.h file?
> > Or maybe just open-code the singly linked list?  (Probably not the
> > best choice, though.)  Add comments stating that the atomic properties
> > > of the llist functions aren't needed?  Something else?
> >
> In order to keep it simple i can replace llist_head by the list_head?

Fine by me!

> > The comments would be a good start.  Just to take pity on people seeing
> > the potential for concurrency and wondering how the concurrent accesses
> > actually happen.  ;-)
> > 
> Sounds like you are kidding me :) 

"Only those who have gone too far can possibly tell you how far you
can go!"  ;-)

							Thanx, Paul
Joel Fernandes May 4, 2020, 6:08 p.m. UTC | #6
On Mon, May 04, 2020 at 07:48:22PM +0200, Uladzislau Rezki wrote:
> On Mon, May 04, 2020 at 08:24:37AM -0700, Paul E. McKenney wrote:
[..] 
> > > > Presumably the list can also be accessed without holding this lock,
> > > > because otherwise we shouldn't need llist...
> > > > 
> > > Hm... We increase the number of elements in cache, therefore it is not
> > > lockless. From the other hand i used llist_head to maintain the cache
> > > because it is single linked list, we do not need "*prev" link. Also
> > > we do not need to init the list.
> > > 
> > > But i can change it to list_head. Please let me know if i need :)
> > 
> > Hmmm...  Maybe it is time for a non-atomic singly linked list?  In the RCU
> > callback processing, the operations were open-coded, but they have been
> > pushed into include/linux/rcu_segcblist.h and kernel/rcu/rcu_segcblist.*.
> > 
> > Maybe some non-atomic/protected/whatever macros in the llist.h file?
> > Or maybe just open-code the singly linked list?  (Probably not the
> > best choice, though.)  Add comments stating that the atomic properties
> > of the llist functions aren't needed?  Something else?
> >
> In order to keep it simple i can replace llist_head by the list_head?

Just to clarify for me, what is the disadvantage of using llist here?

Since we don't care about traversing backwards, isn't it better to use llist
for this use case?

I think Vlad is using locking as we're also tracking the size of the llist to
know when to free pages. This tracking could suffer from the lost-update
problem without any locking, if 2 lockless llist_add calls happened simultaneously.
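A minimal sketch of that lost update, in terms of the patch's own fields
(which is why put_cached_bnode() is only called with krcp->lock held):

    /* Safe to run on two CPUs at once: llist_add() uses cmpxchg internally. */
    llist_add((struct llist_node *) bnode, &krcp->bkvcache);

    /* Not safe: a plain read-modify-write, so two concurrent increments
     * can both read the old value and one update is lost, letting the
     * cache grow past rcu_min_cached_objs unnoticed. */
    krcp->nr_bkv_objs++;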

Also if list_head is used, it will take more space and still use locking.

Thoughts?

thanks,

 - Joel

> > 
> > The comments would be a good start.  Just to take pity on people seeing
> > the potential for concurrency and wondering how the concurrent accesses
> > actually happen.  ;-)
> > 
> Sounds like you are kidding me :) 
> 
> --
> Vlad Rezki
Paul E. McKenney May 4, 2020, 7:01 p.m. UTC | #7
On Mon, May 04, 2020 at 02:08:05PM -0400, Joel Fernandes wrote:
> On Mon, May 04, 2020 at 07:48:22PM +0200, Uladzislau Rezki wrote:
> > On Mon, May 04, 2020 at 08:24:37AM -0700, Paul E. McKenney wrote:
> [..] 
> > > > > Presumably the list can also be accessed without holding this lock,
> > > > > because otherwise we shouldn't need llist...
> > > > > 
> > > > Hm... We increase the number of elements in cache, therefore it is not
> > > > lockless. From the other hand i used llist_head to maintain the cache
> > > > because it is single linked list, we do not need "*prev" link. Also
> > > > we do not need to init the list.
> > > > 
> > > > But i can change it to list_head. Please let me know if i need :)
> > > 
> > > Hmmm...  Maybe it is time for a non-atomic singly linked list?  In the RCU
> > > callback processing, the operations were open-coded, but they have been
> > > pushed into include/linux/rcu_segcblist.h and kernel/rcu/rcu_segcblist.*.
> > > 
> > > Maybe some non-atomic/protected/whatever macros in the llist.h file?
> > > Or maybe just open-code the singly linked list?  (Probably not the
> > > best choice, though.)  Add comments stating that the atomic properties
> > > of the llist functions aren't needed?  Something else?
> > >
> > In order to keep it simple i can replace llist_head by the list_head?
> 
> Just to clarify for me, what is the disadvantage of using llist here?

Are there some llist APIs that are not set up for concurrency?  I am
not seeing any.

The overhead isn't that much of a concern, given that these are not on the
hotpath, but people reading the code and seeing the cmpxchg operations
might be forgiven for believing that there is some concurrency involved
somewhere.

Or am I confused and there are now single-threaded add/delete operations
for llist?

> Since we don't care about traversing backwards, isn't it better to use llist
> for this usecase?
> 
> I think Vlad is using locking as we're also tracking the size of the llist to
> know when to free pages. This tracking could suffer from the lost-update
> > problem without any locking, if 2 lockless llist_add calls happened simultaneously.
> 
> Also if list_head is used, it will take more space and still use locking.

Indeed, it would be best to use a non-concurrent singly linked list.

							Thanx, Paul

> Thoughts?
> 
> thanks,
> 
>  - Joel
> 
> > > 
> > > The comments would be a good start.  Just to take pity on people seeing
> > > the potential for concurrency and wondering how the concurrent accesses
> > > actually happen.  ;-)
> > > 
> > Sounds like you are kidding me :) 
> > 
> > --
> > Vlad Rezki
Joel Fernandes May 4, 2020, 7:37 p.m. UTC | #8
Hi Paul,

On Mon, May 4, 2020 at 3:01 PM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Mon, May 04, 2020 at 02:08:05PM -0400, Joel Fernandes wrote:
> > On Mon, May 04, 2020 at 07:48:22PM +0200, Uladzislau Rezki wrote:
> > > On Mon, May 04, 2020 at 08:24:37AM -0700, Paul E. McKenney wrote:
> > [..]
> > > > > > Presumably the list can also be accessed without holding this lock,
> > > > > > because otherwise we shouldn't need llist...
> > > > > >
> > > > > Hm... We increase the number of elements in cache, therefore it is not
> > > > > lockless. From the other hand i used llist_head to maintain the cache
> > > > > because it is single linked list, we do not need "*prev" link. Also
> > > > > we do not need to init the list.
> > > > >
> > > > > But i can change it to list_head. Please let me know if i need :)
> > > >
> > > > Hmmm...  Maybe it is time for a non-atomic singly linked list?  In the RCU
> > > > callback processing, the operations were open-coded, but they have been
> > > > pushed into include/linux/rcu_segcblist.h and kernel/rcu/rcu_segcblist.*.
> > > >
> > > > Maybe some non-atomic/protected/whatever macros in the llist.h file?
> > > > Or maybe just open-code the singly linked list?  (Probably not the
> > > > best choice, though.)  Add comments stating that the atomic properties
> > > > of the llist functions aren't needed?  Something else?
> > > >
> > > In order to keep it simple i can replace llist_head by the list_head?
> >
> > Just to clarify for me, what is the disadvantage of using llist here?
>
> Are there some llist APIs that are not set up for concurrency?  I am
> not seeing any.

llist deletion racing with another llist deletion will need locking.
So strictly speaking, some locking can still be needed with llist usage?

The locklessness, as I understand it, comes when adding and deleting at the
same time. For that no lock is needed. But in the current patch, it
locks anyway to avoid the lost-update of the size of the list.
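A condensed sketch of those rules (paraphrasing the locking table documented
in include/linux/llist.h; the node/head/lock variables are illustrative):

    /* Any number of producers may add concurrently, no lock needed. */
    llist_add(&node->llnode, &head);

    /* A consumer may also take the whole list locklessly. */
    batch = llist_del_all(&head);

    /* But llist_del_first() racing with another llist_del_first()
     * (or with llist_del_all()) must be serialized by the caller. */
    spin_lock(&lock);
    first = llist_del_first(&head);
    spin_unlock(&lock);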

> The overhead isn't that much of a concern, given that these are not on the
> hotpath, but people reading the code and seeing the cmpxchg operations
> might be forgiven for believing that there is some concurrency involved
> somewhere.
>
> Or am I confused and there are now single-threaded add/delete operations
> for llist?

I do see some examples of llist usage with locking in the kernel code.
One case is: do_init_module() calling llist_add to add to the
init_free_list under module_mutex.

> > Since we don't care about traversing backwards, isn't it better to use llist
> > for this usecase?
> >
> > I think Vlad is using locking as we're also tracking the size of the llist to
> > know when to free pages. This tracking could suffer from the lost-update
> > > problem without any locking, if 2 lockless llist_add calls happened simultaneously.
> >
> > Also if list_head is used, it will take more space and still use locking.
>
> Indeed, it would be best to use a non-concurrent singly linked list.

Ok cool :-)

Is it safe to say something like the following is ruled out? ;-) :-D
#define kfree_rcu_list_add llist_add

Thanks,

 - Joel
Uladzislau Rezki May 4, 2020, 7:51 p.m. UTC | #9
> > > Since we don't care about traversing backwards, isn't it better to use llist
> > > for this usecase?
> > >
> > > I think Vlad is using locking as we're also tracking the size of the llist to
> > > know when to free pages. This tracking could suffer from the lost-update
> > > > problem without any locking, if 2 lockless llist_add calls happened simultaneously.
> > >
> > > Also if list_head is used, it will take more space and still use locking.
> >
> > Indeed, it would be best to use a non-concurrent singly linked list.
> 
> Ok cool :-)
> 
> Is it safe to say something like the following is ruled out? ;-) :-D
> #define kfree_rcu_list_add llist_add
> 
In that case I think it is better just to add a comment about using
llist_head, to state that it is used as a singly linked list to save space
and that access is synchronized by the lock :)

IMHO.
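Such a comment might look roughly like this on the bkvcache fields (wording
illustrative only):

    /*
     * A simple cache of pages kept around for reuse by
     * kvfree_call_rcu(). llist_head is used only because it is a
     * compact singly linked list; there is no lock-less use here,
     * every access is made while holding ->lock.
     */
    struct llist_head bkvcache;
    int nr_bkv_objs;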

--
Vlad Rezki
Joel Fernandes May 4, 2020, 8:15 p.m. UTC | #10
On May 4, 2020 3:51:28 PM EDT, Uladzislau Rezki <urezki@gmail.com> wrote:
>> > > Since we don't care about traversing backwards, isn't it better
>to use llist
>> > > for this usecase?
>> > >
>> > > I think Vlad is using locking as we're also tracking the size of
>the llist to
>> > > know when to free pages. This tracking could suffer from the
>lost-update
>> > > problem without any locking, 2 lockless llist_add happened
>simultaneously.
>> > >
>> > > Also if list_head is used, it will take more space and still use
>locking.
>> >
>> > Indeed, it would be best to use a non-concurrent singly linked
>list.
>> 
>> Ok cool :-)
>> 
>> Is it safe to say something like the following is ruled out? ;-) :-D
>> #define kfree_rcu_list_add llist_add
>> 
>In that case i think it is better just to add a comment about using
>llist_head. To state that it used as a singular list to save space
>and the access is synchronized by the lock :)
>
>IMHO.

Sounds good to me. Thanks,

 - Joel

>
>--
>Vlad Rezki
Paul E. McKenney May 4, 2020, 8:16 p.m. UTC | #11
On Mon, May 04, 2020 at 09:51:28PM +0200, Uladzislau Rezki wrote:
> > > > Since we don't care about traversing backwards, isn't it better to use llist
> > > > for this usecase?
> > > >
> > > > I think Vlad is using locking as we're also tracking the size of the llist to
> > > > know when to free pages. This tracking could suffer from the lost-update
> > > > problem without any locking, if 2 lockless llist_add calls happened simultaneously.
> > > >
> > > > Also if list_head is used, it will take more space and still use locking.
> > >
> > > Indeed, it would be best to use a non-concurrent singly linked list.
> > 
> > Ok cool :-)
> > 
> > Is it safe to say something like the following is ruled out? ;-) :-D
> > #define kfree_rcu_list_add llist_add
> > 
> In that case i think it is better just to add a comment about using
> llist_head. To state that it used as a singular list to save space
> and the access is synchronized by the lock :)
> 
> IMHO.

But adding such a comment would be fine as well.

							Thanx, Paul
Uladzislau Rezki May 5, 2020, 11:03 a.m. UTC | #12
On Mon, May 04, 2020 at 01:16:41PM -0700, Paul E. McKenney wrote:
> On Mon, May 04, 2020 at 09:51:28PM +0200, Uladzislau Rezki wrote:
> > > > > Since we don't care about traversing backwards, isn't it better to use llist
> > > > > for this usecase?
> > > > >
> > > > > I think Vlad is using locking as we're also tracking the size of the llist to
> > > > > know when to free pages. This tracking could suffer from the lost-update
> > > > > problem without any locking, if 2 lockless llist_add calls happened simultaneously.
> > > > >
> > > > > Also if list_head is used, it will take more space and still use locking.
> > > >
> > > > Indeed, it would be best to use a non-concurrent singly linked list.
> > > 
> > > Ok cool :-)
> > > 
> > > Is it safe to say something like the following is ruled out? ;-) :-D
> > > #define kfree_rcu_list_add llist_add
> > > 
> > In that case i think it is better just to add a comment about using
> > llist_head. To state that it used as a singular list to save space
> > and the access is synchronized by the lock :)
> > 
> > IMHO.
> 
> But adding such a comment would be fine as well.
> 
Thank you Paul and Joel!

--
Vlad Rezki

Patch

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 89e9ca3f4e3e..d8975819b1c9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -178,6 +178,14 @@  module_param(gp_init_delay, int, 0444);
 static int gp_cleanup_delay;
 module_param(gp_cleanup_delay, int, 0444);
 
+/*
+ * This rcu parameter is read-only, but can be write also.
+ * It reflects the minimum allowed number of objects which
+ * can be cached per-CPU. Object size is equal to one page.
+ */
+int rcu_min_cached_objs = 2;
+module_param(rcu_min_cached_objs, int, 0444);
+
 /* Retrieve RCU kthreads priority for rcutorture */
 int rcu_get_gp_kthreads_prio(void)
 {
@@ -2887,7 +2895,6 @@  struct kfree_rcu_cpu_work {
  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
  * @head: List of kfree_rcu() objects not yet waiting for a grace period
  * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
- * @bcached: Keeps at most one object for later reuse when build chain blocks
  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
  * @lock: Synchronize access to this structure
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
@@ -2902,7 +2909,6 @@  struct kfree_rcu_cpu_work {
 struct kfree_rcu_cpu {
 	struct rcu_head *head;
 	struct kfree_rcu_bulk_data *bhead;
-	struct kfree_rcu_bulk_data *bcached;
 	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
 	raw_spinlock_t lock;
 	struct delayed_work monitor_work;
@@ -2910,6 +2916,15 @@  struct kfree_rcu_cpu {
 	bool initialized;
 	// Number of objects for which GP not started
 	int count;
+
+	/*
+	 * Number of cached objects which are queued into
+	 * the lock-less list. This cache is used by the
+	 * kvfree_call_rcu() function and as of now its
+	 * size is static.
+	 */
+	struct llist_head bkvcache;
+	int nr_bkv_objs;
 };
 
 static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
@@ -2946,6 +2961,31 @@  krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
 	local_irq_restore(flags);
 }
 
+static inline struct kfree_rcu_bulk_data *
+get_cached_bnode(struct kfree_rcu_cpu *krcp)
+{
+	if (!krcp->nr_bkv_objs)
+		return NULL;
+
+	krcp->nr_bkv_objs--;
+	return (struct kfree_rcu_bulk_data *)
+		llist_del_first(&krcp->bkvcache);
+}
+
+static inline bool
+put_cached_bnode(struct kfree_rcu_cpu *krcp,
+	struct kfree_rcu_bulk_data *bnode)
+{
+	/* Check the limit. */
+	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
+		return false;
+
+	llist_add((struct llist_node *) bnode, &krcp->bkvcache);
+	krcp->nr_bkv_objs++;
+	return true;
+
+}
+
 /*
  * This function is invoked in workqueue context after a grace period.
  * It frees all the objects queued on ->bhead_free or ->head_free.
@@ -2981,7 +3021,12 @@  static void kfree_rcu_work(struct work_struct *work)
 		kfree_bulk(bhead->nr_records, bhead->records);
 		rcu_lock_release(&rcu_callback_map);
 
-		if (cmpxchg(&krcp->bcached, NULL, bhead))
+		krcp = krc_this_cpu_lock(&flags);
+		if (put_cached_bnode(krcp, bhead))
+			bhead = NULL;
+		krc_this_cpu_unlock(krcp, flags);
+
+		if (bhead)
 			free_page((unsigned long) bhead);
 
 		cond_resched_tasks_rcu_qs();
@@ -3114,7 +3159,7 @@  kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
 	/* Check if a new block is required. */
 	if (!krcp->bhead ||
 			krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
-		bnode = xchg(&krcp->bcached, NULL);
+		bnode = get_cached_bnode(krcp);
 		if (!bnode) {
 			WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
 
@@ -4167,12 +4212,23 @@  static void __init kfree_rcu_batch_init(void)
 
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+		struct kfree_rcu_bulk_data *bnode;
 
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;
 		}
 
+		for (i = 0; i < rcu_min_cached_objs; i++) {
+			bnode = (struct kfree_rcu_bulk_data *)
+				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+
+			if (bnode)
+				put_cached_bnode(krcp, bnode);
+			else
+				pr_err("Failed to preallocate for %d CPU!\n", cpu);
+		}
+
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
 		krcp->initialized = true;
 	}