diff mbox series

[v1,08/15] net: add helper executing custom callback from napi

Message ID 20241007221603.1703699-9-dw@davidwei.uk (mailing list archive)
State New
Headers show
Series io_uring zero copy rx | expand

Commit Message

David Wei Oct. 7, 2024, 10:15 p.m. UTC
From: Pavel Begunkov <asml.silence@gmail.com>

It's useful to have napi private bits and pieces like page pool's fast
allocating cache, so that the hot allocation path doesn't have to do any
additional synchronisation. In case of io_uring memory provider
introduced in following patches, we keep the consumer end of the
io_uring's refill queue private to napi as it's a hot path.

However, from time to time we need to synchronise with the napi, for
example to add more user memory or allocate fallback buffers. Add a
helper function napi_execute that allows running a custom callback from
under napi context so that it can access and modify napi protected
parts of io_uring. It works similarly to busy polling and stops napi from
running in the meantime, so it's supposed to be a slow control path.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
---
 include/net/busy_poll.h |  6 +++++
 net/core/dev.c          | 53 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

Comments

Joe Damato Oct. 8, 2024, 10:25 p.m. UTC | #1
On Mon, Oct 07, 2024 at 03:15:56PM -0700, David Wei wrote:
> From: Pavel Begunkov <asml.silence@gmail.com>

[...]

> However, from time to time we need to synchronise with the napi, for
> example to add more user memory or allocate fallback buffers. Add a
> helper function napi_execute that allows to run a custom callback from
> under napi context so that it can access and modify napi protected
> parts of io_uring. It works similar to busy polling and stops napi from
> running in the meantime, so it's supposed to be a slow control path.
> 
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> Signed-off-by: David Wei <dw@davidwei.uk>

[...]

> diff --git a/net/core/dev.c b/net/core/dev.c
> index 1e740faf9e78..ba2f43cf5517 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6497,6 +6497,59 @@ void napi_busy_loop(unsigned int napi_id,
>  }
>  EXPORT_SYMBOL(napi_busy_loop);
>  
> +void napi_execute(unsigned napi_id,
> +		  void (*cb)(void *), void *cb_arg)
> +{
> +	struct napi_struct *napi;
> +	bool done = false;
> +	unsigned long val;
> +	void *have_poll_lock = NULL;
> +
> +	rcu_read_lock();
> +
> +	napi = napi_by_id(napi_id);
> +	if (!napi) {
> +		rcu_read_unlock();
> +		return;
> +	}
> +
> +	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
> +		preempt_disable();
> +	for (;;) {
> +		local_bh_disable();
> +		val = READ_ONCE(napi->state);
> +
> +		/* If multiple threads are competing for this napi,
> +		* we avoid dirtying napi->state as much as we can.
> +		*/
> +		if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
> +			  NAPIF_STATE_IN_BUSY_POLL))
> +			goto restart;
> +
> +		if (cmpxchg(&napi->state, val,
> +			   val | NAPIF_STATE_IN_BUSY_POLL |
> +				 NAPIF_STATE_SCHED) != val)
> +			goto restart;
> +
> +		have_poll_lock = netpoll_poll_lock(napi);
> +		cb(cb_arg);

A lot of the above code seems quite similar to __napi_busy_loop, as
you mentioned.

It might be too painful, but I can't help but wonder if there's a
way to refactor this to use common helpers or something?

I had been thinking that the napi->state check /
cmpxchg could maybe be refactored to avoid being repeated in both
places?
Pavel Begunkov Oct. 9, 2024, 3:09 p.m. UTC | #2
On 10/8/24 23:25, Joe Damato wrote:
> On Mon, Oct 07, 2024 at 03:15:56PM -0700, David Wei wrote:
>> From: Pavel Begunkov <asml.silence@gmail.com>
> 
> [...]
> 
>> However, from time to time we need to synchronise with the napi, for
>> example to add more user memory or allocate fallback buffers. Add a
>> helper function napi_execute that allows to run a custom callback from
>> under napi context so that it can access and modify napi protected
>> parts of io_uring. It works similar to busy polling and stops napi from
>> running in the meantime, so it's supposed to be a slow control path.
>>
>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>> Signed-off-by: David Wei <dw@davidwei.uk>
> 
> [...]
> 
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 1e740faf9e78..ba2f43cf5517 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -6497,6 +6497,59 @@ void napi_busy_loop(unsigned int napi_id,
>>   }
>>   EXPORT_SYMBOL(napi_busy_loop);
>>   
>> +void napi_execute(unsigned napi_id,
>> +		  void (*cb)(void *), void *cb_arg)
>> +{
>> +	struct napi_struct *napi;
>> +	bool done = false;
>> +	unsigned long val;
>> +	void *have_poll_lock = NULL;
>> +
>> +	rcu_read_lock();
>> +
>> +	napi = napi_by_id(napi_id);
>> +	if (!napi) {
>> +		rcu_read_unlock();
>> +		return;
>> +	}
>> +
>> +	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
>> +		preempt_disable();
>> +	for (;;) {
>> +		local_bh_disable();
>> +		val = READ_ONCE(napi->state);
>> +
>> +		/* If multiple threads are competing for this napi,
>> +		* we avoid dirtying napi->state as much as we can.
>> +		*/
>> +		if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
>> +			  NAPIF_STATE_IN_BUSY_POLL))
>> +			goto restart;
>> +
>> +		if (cmpxchg(&napi->state, val,
>> +			   val | NAPIF_STATE_IN_BUSY_POLL |
>> +				 NAPIF_STATE_SCHED) != val)
>> +			goto restart;
>> +
>> +		have_poll_lock = netpoll_poll_lock(napi);
>> +		cb(cb_arg);
> 
> A lot of the above code seems quite similar to __napi_busy_loop, as
> you mentioned.
> 
> It might be too painful, but I can't help but wonder if there's a
> way to refactor this to use common helpers or something?
> 
> I had been thinking that the napi->state check /
> cmpxchg could maybe be refactored to avoid being repeated in both
> places?

Yep, I can add a helper for that, but I'm not sure how to
deduplicate it further while trying not to pollute the
napi polling path.
Joe Damato Oct. 9, 2024, 4:13 p.m. UTC | #3
On Wed, Oct 09, 2024 at 04:09:53PM +0100, Pavel Begunkov wrote:
> On 10/8/24 23:25, Joe Damato wrote:
> > On Mon, Oct 07, 2024 at 03:15:56PM -0700, David Wei wrote:
> > > From: Pavel Begunkov <asml.silence@gmail.com>
> > 
> > [...]
> > 
> > > However, from time to time we need to synchronise with the napi, for
> > > example to add more user memory or allocate fallback buffers. Add a
> > > helper function napi_execute that allows to run a custom callback from
> > > under napi context so that it can access and modify napi protected
> > > parts of io_uring. It works similar to busy polling and stops napi from
> > > running in the meantime, so it's supposed to be a slow control path.
> > > 
> > > Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> > > Signed-off-by: David Wei <dw@davidwei.uk>
> > 
> > [...]
> > 
> > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > index 1e740faf9e78..ba2f43cf5517 100644
> > > --- a/net/core/dev.c
> > > +++ b/net/core/dev.c
> > > @@ -6497,6 +6497,59 @@ void napi_busy_loop(unsigned int napi_id,
> > >   }
> > >   EXPORT_SYMBOL(napi_busy_loop);
> > > +void napi_execute(unsigned napi_id,
> > > +		  void (*cb)(void *), void *cb_arg)
> > > +{
> > > +	struct napi_struct *napi;
> > > +	bool done = false;
> > > +	unsigned long val;
> > > +	void *have_poll_lock = NULL;
> > > +
> > > +	rcu_read_lock();
> > > +
> > > +	napi = napi_by_id(napi_id);
> > > +	if (!napi) {
> > > +		rcu_read_unlock();
> > > +		return;
> > > +	}
> > > +
> > > +	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
> > > +		preempt_disable();
> > > +	for (;;) {
> > > +		local_bh_disable();
> > > +		val = READ_ONCE(napi->state);
> > > +
> > > +		/* If multiple threads are competing for this napi,
> > > +		* we avoid dirtying napi->state as much as we can.
> > > +		*/
> > > +		if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
> > > +			  NAPIF_STATE_IN_BUSY_POLL))
> > > +			goto restart;
> > > +
> > > +		if (cmpxchg(&napi->state, val,
> > > +			   val | NAPIF_STATE_IN_BUSY_POLL |
> > > +				 NAPIF_STATE_SCHED) != val)
> > > +			goto restart;
> > > +
> > > +		have_poll_lock = netpoll_poll_lock(napi);
> > > +		cb(cb_arg);
> > 
> > A lot of the above code seems quite similar to __napi_busy_loop, as
> > you mentioned.
> > 
> > It might be too painful, but I can't help but wonder if there's a
> > way to refactor this to use common helpers or something?
> > 
> > I had been thinking that the napi->state check /
> > cmpxchg could maybe be refactored to avoid being repeated in both
> > places?
> 
> Yep, I can add a helper for that, but I'm not sure how to
> deduplicate it further while trying not to pollute the
> napi polling path.

It was just a minor nit; I wouldn't want to hold back this important
work just for that.

I'm still looking at the code myself to see if I can see a better
arrangement of the code.

But that could always come later as a cleanup for -next ?
Pavel Begunkov Oct. 9, 2024, 7:12 p.m. UTC | #4
On 10/9/24 17:13, Joe Damato wrote:
> On Wed, Oct 09, 2024 at 04:09:53PM +0100, Pavel Begunkov wrote:
>> On 10/8/24 23:25, Joe Damato wrote:
>>> On Mon, Oct 07, 2024 at 03:15:56PM -0700, David Wei wrote:
>>>> From: Pavel Begunkov <asml.silence@gmail.com>
>>>
>>> [...]
>>>
>>>> However, from time to time we need to synchronise with the napi, for
>>>> example to add more user memory or allocate fallback buffers. Add a
>>>> helper function napi_execute that allows to run a custom callback from
>>>> under napi context so that it can access and modify napi protected
>>>> parts of io_uring. It works similar to busy polling and stops napi from
>>>> running in the meantime, so it's supposed to be a slow control path.
>>>>
>>>> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
>>>> Signed-off-by: David Wei <dw@davidwei.uk>
>>>
>>> [...]
>>>
>>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>>> index 1e740faf9e78..ba2f43cf5517 100644
>>>> --- a/net/core/dev.c
>>>> +++ b/net/core/dev.c
>>>> @@ -6497,6 +6497,59 @@ void napi_busy_loop(unsigned int napi_id,
>>>>    }
>>>>    EXPORT_SYMBOL(napi_busy_loop);
>>>> +void napi_execute(unsigned napi_id,
>>>> +		  void (*cb)(void *), void *cb_arg)
>>>> +{
>>>> +	struct napi_struct *napi;
>>>> +	bool done = false;
>>>> +	unsigned long val;
>>>> +	void *have_poll_lock = NULL;
>>>> +
>>>> +	rcu_read_lock();
>>>> +
>>>> +	napi = napi_by_id(napi_id);
>>>> +	if (!napi) {
>>>> +		rcu_read_unlock();
>>>> +		return;
>>>> +	}
>>>> +
>>>> +	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
>>>> +		preempt_disable();
>>>> +	for (;;) {
>>>> +		local_bh_disable();
>>>> +		val = READ_ONCE(napi->state);
>>>> +
>>>> +		/* If multiple threads are competing for this napi,
>>>> +		* we avoid dirtying napi->state as much as we can.
>>>> +		*/
>>>> +		if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
>>>> +			  NAPIF_STATE_IN_BUSY_POLL))
>>>> +			goto restart;
>>>> +
>>>> +		if (cmpxchg(&napi->state, val,
>>>> +			   val | NAPIF_STATE_IN_BUSY_POLL |
>>>> +				 NAPIF_STATE_SCHED) != val)
>>>> +			goto restart;
>>>> +
>>>> +		have_poll_lock = netpoll_poll_lock(napi);
>>>> +		cb(cb_arg);
>>>
>>> A lot of the above code seems quite similar to __napi_busy_loop, as
>>> you mentioned.
>>>
>>> It might be too painful, but I can't help but wonder if there's a
>>> way to refactor this to use common helpers or something?
>>>
>>> I had been thinking that the napi->state check /
>>> cmpxchg could maybe be refactored to avoid being repeated in both
>>> places?
>>
>> Yep, I can add a helper for that, but I'm not sure how to
>> deduplicate it further while trying not to pollute the
>> napi polling path.
> 
> It was just a minor nit; I wouldn't want to hold back this important
> work just for that.
> 
> I'm still looking at the code myself to see if I can see a better
> arrangement of the code.
> 
> But that could always come later as a cleanup for -next ?

It's still early, there will be a v6 anyway. And thanks for
taking a look.
diff mbox series

Patch

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index f03040baaefd..3fd9e65731e9 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -47,6 +47,7 @@  bool sk_busy_loop_end(void *p, unsigned long start_time);
 void napi_busy_loop(unsigned int napi_id,
 		    bool (*loop_end)(void *, unsigned long),
 		    void *loop_end_arg, bool prefer_busy_poll, u16 budget);
+void napi_execute(unsigned napi_id, void (*cb)(void *), void *cb_arg);
 
 void napi_busy_loop_rcu(unsigned int napi_id,
 			bool (*loop_end)(void *, unsigned long),
@@ -63,6 +64,11 @@  static inline bool sk_can_busy_loop(struct sock *sk)
 	return false;
 }
 
+static inline void napi_execute(unsigned napi_id,
+				void (*cb)(void *), void *cb_arg)
+{
+}
+
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
 static inline unsigned long busy_loop_current_time(void)
diff --git a/net/core/dev.c b/net/core/dev.c
index 1e740faf9e78..ba2f43cf5517 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6497,6 +6497,59 @@  void napi_busy_loop(unsigned int napi_id,
 }
 EXPORT_SYMBOL(napi_busy_loop);
 
+void napi_execute(unsigned napi_id,
+		  void (*cb)(void *), void *cb_arg)
+{
+	struct napi_struct *napi;
+	bool done = false;
+	unsigned long val;
+	void *have_poll_lock = NULL;
+
+	rcu_read_lock();
+
+	napi = napi_by_id(napi_id);
+	if (!napi) {
+		rcu_read_unlock();
+		return;
+	}
+
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
+	for (;;) {
+		local_bh_disable();
+		val = READ_ONCE(napi->state);
+
+		/* If multiple threads are competing for this napi,
+		 * we avoid dirtying napi->state as much as we can.
+		 */
+		if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
+			  NAPIF_STATE_IN_BUSY_POLL))
+			goto restart;
+
+		if (cmpxchg(&napi->state, val,
+			   val | NAPIF_STATE_IN_BUSY_POLL |
+				 NAPIF_STATE_SCHED) != val)
+			goto restart;
+
+		have_poll_lock = netpoll_poll_lock(napi);
+		cb(cb_arg);
+		done = true;
+		gro_normal_list(napi);
+		local_bh_enable();
+		break;
+restart:
+		local_bh_enable();
+		if (unlikely(need_resched()))
+			break;
+		cpu_relax();
+	}
+	if (done)
+		busy_poll_stop(napi, have_poll_lock, false, 1);
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
+	rcu_read_unlock();
+}
+
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
 static void napi_hash_add(struct napi_struct *napi)