diff mbox series

[v2] rcu/kvfree: Add kvfree_rcu_barrier() API

Message ID 20240820155935.1167988-1-urezki@gmail.com (mailing list archive)
State New
Headers show
Series [v2] rcu/kvfree: Add kvfree_rcu_barrier() API | expand

Commit Message

Uladzislau Rezki Aug. 20, 2024, 3:59 p.m. UTC
Add a kvfree_rcu_barrier() function. It waits until all
in-flight pointers are freed over the RCU machinery. It does
not wait for any GP completion and is within its rights to
return immediately if there are no outstanding pointers.

This function is useful when there is a need to guarantee
that memory is fully freed before destroying memory caches.
For example, when unloading a kernel module.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/rcutiny.h |   5 ++
 include/linux/rcutree.h |   1 +
 kernel/rcu/tree.c       | 109 +++++++++++++++++++++++++++++++++++++---
 3 files changed, 107 insertions(+), 8 deletions(-)

Comments

Vlastimil Babka Aug. 21, 2024, 6:06 p.m. UTC | #1
On 8/20/24 5:59 PM, Uladzislau Rezki (Sony) wrote:
> Add a kvfree_rcu_barrier() function. It waits until all
> in-flight pointers are freed over RCU machinery. It does
> not wait any GP completion and it is within its right to
> return immediately if there are no outstanding pointers.
> 
> This function is useful when there is a need to guarantee
> that a memory is fully freed before destroying memory caches.
> For example, during unloading a kernel module.
> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Thanks Ulad, replaced the patch in slab/for-next

> ---
>  include/linux/rcutiny.h |   5 ++
>  include/linux/rcutree.h |   1 +
>  kernel/rcu/tree.c       | 109 +++++++++++++++++++++++++++++++++++++---
>  3 files changed, 107 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
> index d9ac7b136aea..522123050ff8 100644
> --- a/include/linux/rcutiny.h
> +++ b/include/linux/rcutiny.h
> @@ -111,6 +111,11 @@ static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr)
>  	kvfree(ptr);
>  }
>  
> +static inline void kvfree_rcu_barrier(void)
> +{
> +	rcu_barrier();
> +}
> +
>  #ifdef CONFIG_KASAN_GENERIC
>  void kvfree_call_rcu(struct rcu_head *head, void *ptr);
>  #else
> diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
> index 254244202ea9..58e7db80f3a8 100644
> --- a/include/linux/rcutree.h
> +++ b/include/linux/rcutree.h
> @@ -35,6 +35,7 @@ static inline void rcu_virt_note_context_switch(void)
>  
>  void synchronize_rcu_expedited(void);
>  void kvfree_call_rcu(struct rcu_head *head, void *ptr);
> +void kvfree_rcu_barrier(void);
>  
>  void rcu_barrier(void);
>  void rcu_momentary_dyntick_idle(void);
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index e641cc681901..be00aac5f4e7 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3584,18 +3584,15 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
>  }
>  
>  /*
> - * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
> + * Return: %true if a work is queued, %false otherwise.
>   */
> -static void kfree_rcu_monitor(struct work_struct *work)
> +static bool
> +kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
>  {
> -	struct kfree_rcu_cpu *krcp = container_of(work,
> -		struct kfree_rcu_cpu, monitor_work.work);
>  	unsigned long flags;
> +	bool queued = false;
>  	int i, j;
>  
> -	// Drain ready for reclaim.
> -	kvfree_rcu_drain_ready(krcp);
> -
>  	raw_spin_lock_irqsave(&krcp->lock, flags);
>  
>  	// Attempt to start a new batch.
> @@ -3634,11 +3631,27 @@ static void kfree_rcu_monitor(struct work_struct *work)
>  			// be that the work is in the pending state when
>  			// channels have been detached following by each
>  			// other.
> -			queue_rcu_work(system_wq, &krwp->rcu_work);
> +			queued = queue_rcu_work(system_wq, &krwp->rcu_work);
>  		}
>  	}
>  
>  	raw_spin_unlock_irqrestore(&krcp->lock, flags);
> +	return queued;
> +}
> +
> +/*
> + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
> + */
> +static void kfree_rcu_monitor(struct work_struct *work)
> +{
> +	struct kfree_rcu_cpu *krcp = container_of(work,
> +		struct kfree_rcu_cpu, monitor_work.work);
> +
> +	// Drain ready for reclaim.
> +	kvfree_rcu_drain_ready(krcp);
> +
> +	// Queue a batch for a rest.
> +	kvfree_rcu_queue_batch(krcp);
>  
>  	// If there is nothing to detach, it means that our job is
>  	// successfully done here. In case of having at least one
> @@ -3859,6 +3872,86 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
>  }
>  EXPORT_SYMBOL_GPL(kvfree_call_rcu);
>  
> +/**
> + * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
> + *
> + * Note that a single argument of kvfree_rcu() call has a slow path that
> + * triggers synchronize_rcu() following by freeing a pointer. It is done
> + * before the return from the function. Therefore for any single-argument
> + * call that will result in a kfree() to a cache that is to be destroyed
> + * during module exit, it is developer's responsibility to ensure that all
> + * such calls have returned before the call to kmem_cache_destroy().
> + */
> +void kvfree_rcu_barrier(void)
> +{
> +	struct kfree_rcu_cpu_work *krwp;
> +	struct kfree_rcu_cpu *krcp;
> +	bool queued;
> +	int i, cpu;
> +
> +	/*
> +	 * Firstly we detach objects and queue them over an RCU-batch
> +	 * for all CPUs. Finally queued works are flushed for each CPU.
> +	 *
> +	 * Please note. If there are outstanding batches for a particular
> +	 * CPU, those have to be finished first following by queuing a new.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		krcp = per_cpu_ptr(&krc, cpu);
> +
> +		/*
> +		 * Check if this CPU has any objects which have been queued for a
> +		 * new GP completion. If not(means nothing to detach), we are done
> +		 * with it. If any batch is pending/running for this "krcp", below
> +		 * per-cpu flush_rcu_work() waits its completion(see last step).
> +		 */
> +		if (!need_offload_krc(krcp))
> +			continue;
> +
> +		while (1) {
> +			/*
> +			 * If we are not able to queue a new RCU work it means:
> +			 * - batches for this CPU are still in flight which should
> +			 *   be flushed first and then repeat;
> +			 * - no objects to detach, because of concurrency.
> +			 */
> +			queued = kvfree_rcu_queue_batch(krcp);
> +
> +			/*
> +			 * Bail out, if there is no need to offload this "krcp"
> +			 * anymore. As noted earlier it can run concurrently.
> +			 */
> +			if (queued || !need_offload_krc(krcp))
> +				break;
> +
> +			/* There are ongoing batches. */
> +			for (i = 0; i < KFREE_N_BATCHES; i++) {
> +				krwp = &(krcp->krw_arr[i]);
> +				flush_rcu_work(&krwp->rcu_work);
> +			}
> +		}
> +	}
> +
> +	/*
> +	 * Now we guarantee that all objects are flushed.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		krcp = per_cpu_ptr(&krc, cpu);
> +
> +		/*
> +		 * A monitor work can drain ready to reclaim objects
> +		 * directly. Wait its completion if running or pending.
> +		 */
> +		cancel_delayed_work_sync(&krcp->monitor_work);
> +
> +		for (i = 0; i < KFREE_N_BATCHES; i++) {
> +			krwp = &(krcp->krw_arr[i]);
> +			flush_rcu_work(&krwp->rcu_work);
> +		}
> +	}
> +}
> +EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
> +
>  static unsigned long
>  kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
>  {
Uladzislau Rezki Aug. 22, 2024, 12:04 p.m. UTC | #2
On Wed, Aug 21, 2024 at 08:06:08PM +0200, Vlastimil Babka wrote:
> On 8/20/24 5:59 PM, Uladzislau Rezki (Sony) wrote:
> > Add a kvfree_rcu_barrier() function. It waits until all
> > in-flight pointers are freed over RCU machinery. It does
> > not wait any GP completion and it is within its right to
> > return immediately if there are no outstanding pointers.
> > 
> > This function is useful when there is a need to guarantee
> > that a memory is fully freed before destroying memory caches.
> > For example, during unloading a kernel module.
> > 
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> 
> Thanks Ulad, replaced the patch in slab/for-next
> 
You are welcome :)

--
Uladzislau Rezki
Paul E. McKenney Sept. 10, 2024, 3:42 p.m. UTC | #3
On Tue, Aug 20, 2024 at 05:59:35PM +0200, Uladzislau Rezki (Sony) wrote:
> Add a kvfree_rcu_barrier() function. It waits until all
> in-flight pointers are freed over RCU machinery. It does
> not wait any GP completion and it is within its right to
> return immediately if there are no outstanding pointers.
> 
> This function is useful when there is a need to guarantee
> that a memory is fully freed before destroying memory caches.
> For example, during unloading a kernel module.
> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

As a follow-on patch, once kvfree_rcu_barrier() is accepted into
mainline, should we add a call to kvfree_rcu_barrier() to the
rcu_barrier_throttled() function in kernel/rcu/tree.c?

This would allow the do_rcu_barrier module parameter to be used to clear
out kfree_rcu() as well as call_rcu() work.  This would be useful to
people running userspace benchmarks that cause the kernel to do a lot
of kfree_rcu() calls.  Always good to avoid messing up the results from
the current run due to deferred work from the previous run.  Even better
would be to actually account for the deferred work, but do_rcu_barrier
can help with that as well.  ;-)

Thoughts?

							Thanx, Paul

> ---
>  include/linux/rcutiny.h |   5 ++
>  include/linux/rcutree.h |   1 +
>  kernel/rcu/tree.c       | 109 +++++++++++++++++++++++++++++++++++++---
>  3 files changed, 107 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
> index d9ac7b136aea..522123050ff8 100644
> --- a/include/linux/rcutiny.h
> +++ b/include/linux/rcutiny.h
> @@ -111,6 +111,11 @@ static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr)
>  	kvfree(ptr);
>  }
>  
> +static inline void kvfree_rcu_barrier(void)
> +{
> +	rcu_barrier();
> +}
> +
>  #ifdef CONFIG_KASAN_GENERIC
>  void kvfree_call_rcu(struct rcu_head *head, void *ptr);
>  #else
> diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
> index 254244202ea9..58e7db80f3a8 100644
> --- a/include/linux/rcutree.h
> +++ b/include/linux/rcutree.h
> @@ -35,6 +35,7 @@ static inline void rcu_virt_note_context_switch(void)
>  
>  void synchronize_rcu_expedited(void);
>  void kvfree_call_rcu(struct rcu_head *head, void *ptr);
> +void kvfree_rcu_barrier(void);
>  
>  void rcu_barrier(void);
>  void rcu_momentary_dyntick_idle(void);
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index e641cc681901..be00aac5f4e7 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3584,18 +3584,15 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
>  }
>  
>  /*
> - * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
> + * Return: %true if a work is queued, %false otherwise.
>   */
> -static void kfree_rcu_monitor(struct work_struct *work)
> +static bool
> +kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
>  {
> -	struct kfree_rcu_cpu *krcp = container_of(work,
> -		struct kfree_rcu_cpu, monitor_work.work);
>  	unsigned long flags;
> +	bool queued = false;
>  	int i, j;
>  
> -	// Drain ready for reclaim.
> -	kvfree_rcu_drain_ready(krcp);
> -
>  	raw_spin_lock_irqsave(&krcp->lock, flags);
>  
>  	// Attempt to start a new batch.
> @@ -3634,11 +3631,27 @@ static void kfree_rcu_monitor(struct work_struct *work)
>  			// be that the work is in the pending state when
>  			// channels have been detached following by each
>  			// other.
> -			queue_rcu_work(system_wq, &krwp->rcu_work);
> +			queued = queue_rcu_work(system_wq, &krwp->rcu_work);
>  		}
>  	}
>  
>  	raw_spin_unlock_irqrestore(&krcp->lock, flags);
> +	return queued;
> +}
> +
> +/*
> + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
> + */
> +static void kfree_rcu_monitor(struct work_struct *work)
> +{
> +	struct kfree_rcu_cpu *krcp = container_of(work,
> +		struct kfree_rcu_cpu, monitor_work.work);
> +
> +	// Drain ready for reclaim.
> +	kvfree_rcu_drain_ready(krcp);
> +
> +	// Queue a batch for a rest.
> +	kvfree_rcu_queue_batch(krcp);
>  
>  	// If there is nothing to detach, it means that our job is
>  	// successfully done here. In case of having at least one
> @@ -3859,6 +3872,86 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
>  }
>  EXPORT_SYMBOL_GPL(kvfree_call_rcu);
>  
> +/**
> + * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
> + *
> + * Note that a single argument of kvfree_rcu() call has a slow path that
> + * triggers synchronize_rcu() following by freeing a pointer. It is done
> + * before the return from the function. Therefore for any single-argument
> + * call that will result in a kfree() to a cache that is to be destroyed
> + * during module exit, it is developer's responsibility to ensure that all
> + * such calls have returned before the call to kmem_cache_destroy().
> + */
> +void kvfree_rcu_barrier(void)
> +{
> +	struct kfree_rcu_cpu_work *krwp;
> +	struct kfree_rcu_cpu *krcp;
> +	bool queued;
> +	int i, cpu;
> +
> +	/*
> +	 * Firstly we detach objects and queue them over an RCU-batch
> +	 * for all CPUs. Finally queued works are flushed for each CPU.
> +	 *
> +	 * Please note. If there are outstanding batches for a particular
> +	 * CPU, those have to be finished first following by queuing a new.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		krcp = per_cpu_ptr(&krc, cpu);
> +
> +		/*
> +		 * Check if this CPU has any objects which have been queued for a
> +		 * new GP completion. If not(means nothing to detach), we are done
> +		 * with it. If any batch is pending/running for this "krcp", below
> +		 * per-cpu flush_rcu_work() waits its completion(see last step).
> +		 */
> +		if (!need_offload_krc(krcp))
> +			continue;
> +
> +		while (1) {
> +			/*
> +			 * If we are not able to queue a new RCU work it means:
> +			 * - batches for this CPU are still in flight which should
> +			 *   be flushed first and then repeat;
> +			 * - no objects to detach, because of concurrency.
> +			 */
> +			queued = kvfree_rcu_queue_batch(krcp);
> +
> +			/*
> +			 * Bail out, if there is no need to offload this "krcp"
> +			 * anymore. As noted earlier it can run concurrently.
> +			 */
> +			if (queued || !need_offload_krc(krcp))
> +				break;
> +
> +			/* There are ongoing batches. */
> +			for (i = 0; i < KFREE_N_BATCHES; i++) {
> +				krwp = &(krcp->krw_arr[i]);
> +				flush_rcu_work(&krwp->rcu_work);
> +			}
> +		}
> +	}
> +
> +	/*
> +	 * Now we guarantee that all objects are flushed.
> +	 */
> +	for_each_possible_cpu(cpu) {
> +		krcp = per_cpu_ptr(&krc, cpu);
> +
> +		/*
> +		 * A monitor work can drain ready to reclaim objects
> +		 * directly. Wait its completion if running or pending.
> +		 */
> +		cancel_delayed_work_sync(&krcp->monitor_work);
> +
> +		for (i = 0; i < KFREE_N_BATCHES; i++) {
> +			krwp = &(krcp->krw_arr[i]);
> +			flush_rcu_work(&krwp->rcu_work);
> +		}
> +	}
> +}
> +EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
> +
>  static unsigned long
>  kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
>  {
> -- 
> 2.39.2
>
Uladzislau Rezki Sept. 11, 2024, 9:43 a.m. UTC | #4
On Tue, Sep 10, 2024 at 08:42:54AM -0700, Paul E. McKenney wrote:
> On Tue, Aug 20, 2024 at 05:59:35PM +0200, Uladzislau Rezki (Sony) wrote:
> > Add a kvfree_rcu_barrier() function. It waits until all
> > in-flight pointers are freed over RCU machinery. It does
> > not wait any GP completion and it is within its right to
> > return immediately if there are no outstanding pointers.
> > 
> > This function is useful when there is a need to guarantee
> > that a memory is fully freed before destroying memory caches.
> > For example, during unloading a kernel module.
> > 
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> 
> As a follow-on patch, once kvfree_rcu_barrier() is accepted into
> mainline, should we add a call to kvfree_rcu_barrier() to the
> rcu_barrier_throttled() function in kernel/rcu/tree.c?
> 
> This would allow the do_rcu_barrier module parameter to be used to clear
> out kfree_rcu() as well as call_rcu() work.  This would be useful to
> people running userspace benchmarks that cause the kernel to do a lot
> of kfree_rcu() calls.  Always good to avoid messing up the results from
> the current run due to deferred work from the previous run.  Even better
> would be to actually account for the deferred work, but do_rcu_barrier
> can help with that as well.  ;-)
> 
> Thoughts?
>
Makes sense. It makes sure that all objects are flushed. And as you
mentioned, it is good to have it for benchmarking as a way to return to
a baseline point.

One issue is probably a "name" which would be common for both:

rcu_barrier()
kvfree_rcu_barrier()

I mean /sys/module/rcutree/parameters/do_rcu_barrier. From how I
see it, it is supposed to trigger just the rcu_barrier() API.

--
Uladzislau Rezki
Paul E. McKenney Sept. 11, 2024, 10:39 a.m. UTC | #5
On Wed, Sep 11, 2024 at 11:43:54AM +0200, Uladzislau Rezki wrote:
> On Tue, Sep 10, 2024 at 08:42:54AM -0700, Paul E. McKenney wrote:
> > On Tue, Aug 20, 2024 at 05:59:35PM +0200, Uladzislau Rezki (Sony) wrote:
> > > Add a kvfree_rcu_barrier() function. It waits until all
> > > in-flight pointers are freed over RCU machinery. It does
> > > not wait any GP completion and it is within its right to
> > > return immediately if there are no outstanding pointers.
> > > 
> > > This function is useful when there is a need to guarantee
> > > that a memory is fully freed before destroying memory caches.
> > > For example, during unloading a kernel module.
> > > 
> > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> > 
> > As a follow-on patch, once kvfree_rcu_barrier() is accepted into
> > mainline, should we add a call to kvfree_rcu_barrier() to the
> > rcu_barrier_throttled() function in kernel/rcu/tree.c?
> > 
> > This would allow the do_rcu_barrier module parameter to be used to clear
> > out kfree_rcu() as well as call_rcu() work.  This would be useful to
> > people running userspace benchmarks that cause the kernel to do a lot
> > of kfree_rcu() calls.  Always good to avoid messing up the results from
> > the current run due to deferred work from the previous run.  Even better
> > would be to actually account for the deferred work, but do_rcu_barrier
> > can help with that as well.  ;-)
> > 
> > Thoughts?
> >
> Makes sense. To be make sure that all objects are flushed. And as you
> mentioned it is good to have it for benchmarking as a return to a baseline
> point.
> 
> One issue is probably a "name" which would be common for both:
> 
> rcu_barrier()
> kvfree_rcu_barrier()
> 
> i mean /sys/module/rcutree/parameters/do_rcu_barrier. From how i
> would see it, it is supposed to trigger just rcu_barrier() API.

One approach would be to keep the old functionality, but create
a new sysfs variable that does both.  Except that to avoid code
duplication, we would likely end up with both actually doing
both.

Another approach is to rename the sysfs variable.  This might
work if there are not too many people using it.  Might.  ;-)

Other approaches?

						Thanx, Paul
Uladzislau Rezki Sept. 12, 2024, 4:16 p.m. UTC | #6
On Wed, Sep 11, 2024 at 03:39:19AM -0700, Paul E. McKenney wrote:
> On Wed, Sep 11, 2024 at 11:43:54AM +0200, Uladzislau Rezki wrote:
> > On Tue, Sep 10, 2024 at 08:42:54AM -0700, Paul E. McKenney wrote:
> > > On Tue, Aug 20, 2024 at 05:59:35PM +0200, Uladzislau Rezki (Sony) wrote:
> > > > Add a kvfree_rcu_barrier() function. It waits until all
> > > > in-flight pointers are freed over RCU machinery. It does
> > > > not wait any GP completion and it is within its right to
> > > > return immediately if there are no outstanding pointers.
> > > > 
> > > > This function is useful when there is a need to guarantee
> > > > that a memory is fully freed before destroying memory caches.
> > > > For example, during unloading a kernel module.
> > > > 
> > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> > > 
> > > As a follow-on patch, once kvfree_rcu_barrier() is accepted into
> > > mainline, should we add a call to kvfree_rcu_barrier() to the
> > > rcu_barrier_throttled() function in kernel/rcu/tree.c?
> > > 
> > > This would allow the do_rcu_barrier module parameter to be used to clear
> > > out kfree_rcu() as well as call_rcu() work.  This would be useful to
> > > people running userspace benchmarks that cause the kernel to do a lot
> > > of kfree_rcu() calls.  Always good to avoid messing up the results from
> > > the current run due to deferred work from the previous run.  Even better
> > > would be to actually account for the deferred work, but do_rcu_barrier
> > > can help with that as well.  ;-)
> > > 
> > > Thoughts?
> > >
> > Makes sense. To be make sure that all objects are flushed. And as you
> > mentioned it is good to have it for benchmarking as a return to a baseline
> > point.
> > 
> > One issue is probably a "name" which would be common for both:
> > 
> > rcu_barrier()
> > kvfree_rcu_barrier()
> > 
> > i mean /sys/module/rcutree/parameters/do_rcu_barrier. From how i
> > would see it, it is supposed to trigger just rcu_barrier() API.
> 
> One approach would be to keep the old functionality, but create
> a new sysfs variable that does both.  Except that to avoid code
> duplication, we would likely end up with both actually doing
> both.
> 
> Another approach is to rename the sysfs variable.  This might
> work if there are not too many people using it.  Might.  ;-)
> 
> Other approaches?
> 
Maybe just rename from/to: do_rcu_barrier -> do_barrier? Probably this
would be the best, but as you noted, there might be users :)

To be safe, we can add kvfree_rcu_barrier() to rcu_barrier_throttled() and
document that it does both now!

--
Uladzislau Rezki
Paul E. McKenney Sept. 12, 2024, 6:07 p.m. UTC | #7
On Thu, Sep 12, 2024 at 06:16:56PM +0200, Uladzislau Rezki wrote:
> On Wed, Sep 11, 2024 at 03:39:19AM -0700, Paul E. McKenney wrote:
> > On Wed, Sep 11, 2024 at 11:43:54AM +0200, Uladzislau Rezki wrote:
> > > On Tue, Sep 10, 2024 at 08:42:54AM -0700, Paul E. McKenney wrote:
> > > > On Tue, Aug 20, 2024 at 05:59:35PM +0200, Uladzislau Rezki (Sony) wrote:
> > > > > Add a kvfree_rcu_barrier() function. It waits until all
> > > > > in-flight pointers are freed over RCU machinery. It does
> > > > > not wait any GP completion and it is within its right to
> > > > > return immediately if there are no outstanding pointers.
> > > > > 
> > > > > This function is useful when there is a need to guarantee
> > > > > that a memory is fully freed before destroying memory caches.
> > > > > For example, during unloading a kernel module.
> > > > > 
> > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> > > > 
> > > > As a follow-on patch, once kvfree_rcu_barrier() is accepted into
> > > > mainline, should we add a call to kvfree_rcu_barrier() to the
> > > > rcu_barrier_throttled() function in kernel/rcu/tree.c?
> > > > 
> > > > This would allow the do_rcu_barrier module parameter to be used to clear
> > > > out kfree_rcu() as well as call_rcu() work.  This would be useful to
> > > > people running userspace benchmarks that cause the kernel to do a lot
> > > > of kfree_rcu() calls.  Always good to avoid messing up the results from
> > > > the current run due to deferred work from the previous run.  Even better
> > > > would be to actually account for the deferred work, but do_rcu_barrier
> > > > can help with that as well.  ;-)
> > > > 
> > > > Thoughts?
> > > >
> > > Makes sense. To be make sure that all objects are flushed. And as you
> > > mentioned it is good to have it for benchmarking as a return to a baseline
> > > point.
> > > 
> > > One issue is probably a "name" which would be common for both:
> > > 
> > > rcu_barrier()
> > > kvfree_rcu_barrier()
> > > 
> > > i mean /sys/module/rcutree/parameters/do_rcu_barrier. From how i
> > > would see it, it is supposed to trigger just rcu_barrier() API.
> > 
> > One approach would be to keep the old functionality, but create
> > a new sysfs variable that does both.  Except that to avoid code
> > duplication, we would likely end up with both actually doing
> > both.
> > 
> > Another approach is to rename the sysfs variable.  This might
> > work if there are not too many people using it.  Might.  ;-)
> > 
> > Other approaches?
> > 
> Maybe just rename from/to: do_rcu_barrier -> do_barrier? Probably this
> would be the best, but as you noted, there might be users :)
> 
> To be safe, we can add kvfree_rcu_barrier() to the rcu_barrier_throttled() and  
> document that it does both now!

That does sound safest to me.  We just might find that our users (if any)
expected that it already did both.  ;-)

							Thanx, Paul
Uladzislau Rezki Sept. 13, 2024, 12:24 p.m. UTC | #8
On Thu, Sep 12, 2024 at 11:07:24AM -0700, Paul E. McKenney wrote:
> On Thu, Sep 12, 2024 at 06:16:56PM +0200, Uladzislau Rezki wrote:
> > On Wed, Sep 11, 2024 at 03:39:19AM -0700, Paul E. McKenney wrote:
> > > On Wed, Sep 11, 2024 at 11:43:54AM +0200, Uladzislau Rezki wrote:
> > > > On Tue, Sep 10, 2024 at 08:42:54AM -0700, Paul E. McKenney wrote:
> > > > > On Tue, Aug 20, 2024 at 05:59:35PM +0200, Uladzislau Rezki (Sony) wrote:
> > > > > > Add a kvfree_rcu_barrier() function. It waits until all
> > > > > > in-flight pointers are freed over RCU machinery. It does
> > > > > > not wait any GP completion and it is within its right to
> > > > > > return immediately if there are no outstanding pointers.
> > > > > > 
> > > > > > This function is useful when there is a need to guarantee
> > > > > > that a memory is fully freed before destroying memory caches.
> > > > > > For example, during unloading a kernel module.
> > > > > > 
> > > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > > > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> > > > > 
> > > > > As a follow-on patch, once kvfree_rcu_barrier() is accepted into
> > > > > mainline, should we add a call to kvfree_rcu_barrier() to the
> > > > > rcu_barrier_throttled() function in kernel/rcu/tree.c?
> > > > > 
> > > > > This would allow the do_rcu_barrier module parameter to be used to clear
> > > > > out kfree_rcu() as well as call_rcu() work.  This would be useful to
> > > > > people running userspace benchmarks that cause the kernel to do a lot
> > > > > of kfree_rcu() calls.  Always good to avoid messing up the results from
> > > > > the current run due to deferred work from the previous run.  Even better
> > > > > would be to actually account for the deferred work, but do_rcu_barrier
> > > > > can help with that as well.  ;-)
> > > > > 
> > > > > Thoughts?
> > > > >
> > > > Makes sense. To be make sure that all objects are flushed. And as you
> > > > mentioned it is good to have it for benchmarking as a return to a baseline
> > > > point.
> > > > 
> > > > One issue is probably a "name" which would be common for both:
> > > > 
> > > > rcu_barrier()
> > > > kvfree_rcu_barrier()
> > > > 
> > > > i mean /sys/module/rcutree/parameters/do_rcu_barrier. From how i
> > > > would see it, it is supposed to trigger just rcu_barrier() API.
> > > 
> > > One approach would be to keep the old functionality, but create
> > > a new sysfs variable that does both.  Except that to avoid code
> > > duplication, we would likely end up with both actually doing
> > > both.
> > > 
> > > Another approach is to rename the sysfs variable.  This might
> > > work if there are not too many people using it.  Might.  ;-)
> > > 
> > > Other approaches?
> > > 
> > Maybe just rename from/to: do_rcu_barrier -> do_barrier? Probably this
> > would be the best, but as you noted, there might be users :)
> > 
> > To be safe, we can add kvfree_rcu_barrier() to the rcu_barrier_throttled() and  
> > document that it does both now!
> 
> That does sound safest to me.  We just might find that our users (if any)
> expected that it already did both.  ;-)
> 
Good. Then i will send out the patch!

--
Uladzislau Rezki
diff mbox series

Patch

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index d9ac7b136aea..522123050ff8 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -111,6 +111,11 @@  static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr)
 	kvfree(ptr);
 }
 
+static inline void kvfree_rcu_barrier(void)
+{
+	rcu_barrier();
+}
+
 #ifdef CONFIG_KASAN_GENERIC
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
 #else
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 254244202ea9..58e7db80f3a8 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -35,6 +35,7 @@  static inline void rcu_virt_note_context_switch(void)
 
 void synchronize_rcu_expedited(void);
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
+void kvfree_rcu_barrier(void);
 
 void rcu_barrier(void);
 void rcu_momentary_dyntick_idle(void);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e641cc681901..be00aac5f4e7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3584,18 +3584,15 @@  kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
 }
 
 /*
- * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ * Return: %true if a work is queued, %false otherwise.
  */
-static void kfree_rcu_monitor(struct work_struct *work)
+static bool
+kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
 {
-	struct kfree_rcu_cpu *krcp = container_of(work,
-		struct kfree_rcu_cpu, monitor_work.work);
 	unsigned long flags;
+	bool queued = false;
 	int i, j;
 
-	// Drain ready for reclaim.
-	kvfree_rcu_drain_ready(krcp);
-
 	raw_spin_lock_irqsave(&krcp->lock, flags);
 
 	// Attempt to start a new batch.
@@ -3634,11 +3631,27 @@  static void kfree_rcu_monitor(struct work_struct *work)
 			// be that the work is in the pending state when
 			// channels have been detached following by each
 			// other.
-			queue_rcu_work(system_wq, &krwp->rcu_work);
+			queued = queue_rcu_work(system_wq, &krwp->rcu_work);
 		}
 	}
 
 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
+	return queued;
+}
+
+/*
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ */
+static void kfree_rcu_monitor(struct work_struct *work)
+{
+	struct kfree_rcu_cpu *krcp = container_of(work,
+		struct kfree_rcu_cpu, monitor_work.work);
+
+	// Drain ready for reclaim.
+	kvfree_rcu_drain_ready(krcp);
+
+	// Queue a batch for a rest.
+	kvfree_rcu_queue_batch(krcp);
 
 	// If there is nothing to detach, it means that our job is
 	// successfully done here. In case of having at least one
@@ -3859,6 +3872,86 @@  void kvfree_call_rcu(struct rcu_head *head, void *ptr)
 }
 EXPORT_SYMBOL_GPL(kvfree_call_rcu);
 
+/**
+ * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
+ *
+ * Note that a single argument of kvfree_rcu() call has a slow path that
+ * triggers synchronize_rcu() following by freeing a pointer. It is done
+ * before the return from the function. Therefore for any single-argument
+ * call that will result in a kfree() to a cache that is to be destroyed
+ * during module exit, it is developer's responsibility to ensure that all
+ * such calls have returned before the call to kmem_cache_destroy().
+ */
+void kvfree_rcu_barrier(void)
+{
+	struct kfree_rcu_cpu_work *krwp;
+	struct kfree_rcu_cpu *krcp;
+	bool queued;
+	int i, cpu;
+
+	/*
+	 * Firstly we detach objects and queue them over an RCU-batch
+	 * for all CPUs. Finally queued works are flushed for each CPU.
+	 *
+	 * Please note. If there are outstanding batches for a particular
+	 * CPU, those have to be finished first following by queuing a new.
+	 */
+	for_each_possible_cpu(cpu) {
+		krcp = per_cpu_ptr(&krc, cpu);
+
+		/*
+		 * Check if this CPU has any objects which have been queued for a
+		 * new GP completion. If not(means nothing to detach), we are done
+		 * with it. If any batch is pending/running for this "krcp", below
+		 * per-cpu flush_rcu_work() waits its completion(see last step).
+		 */
+		if (!need_offload_krc(krcp))
+			continue;
+
+		while (1) {
+			/*
+			 * If we are not able to queue a new RCU work it means:
+			 * - batches for this CPU are still in flight which should
+			 *   be flushed first and then repeat;
+			 * - no objects to detach, because of concurrency.
+			 */
+			queued = kvfree_rcu_queue_batch(krcp);
+
+			/*
+			 * Bail out, if there is no need to offload this "krcp"
+			 * anymore. As noted earlier it can run concurrently.
+			 */
+			if (queued || !need_offload_krc(krcp))
+				break;
+
+			/* There are ongoing batches. */
+			for (i = 0; i < KFREE_N_BATCHES; i++) {
+				krwp = &(krcp->krw_arr[i]);
+				flush_rcu_work(&krwp->rcu_work);
+			}
+		}
+	}
+
+	/*
+	 * Now we guarantee that all objects are flushed.
+	 */
+	for_each_possible_cpu(cpu) {
+		krcp = per_cpu_ptr(&krc, cpu);
+
+		/*
+		 * A monitor work can drain ready to reclaim objects
+		 * directly. Wait its completion if running or pending.
+		 */
+		cancel_delayed_work_sync(&krcp->monitor_work);
+
+		for (i = 0; i < KFREE_N_BATCHES; i++) {
+			krwp = &(krcp->krw_arr[i]);
+			flush_rcu_work(&krwp->rcu_work);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
+
 static unsigned long
 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 {