diff mbox series

eventfd: support delayed wakeup for non-semaphore eventfd to reduce cpu utilization

Message ID tencent_AF886EF226FD9F39D28FE4D9A94A95FA2605@qq.com (mailing list archive)
State Mainlined, archived
Headers show
Series eventfd: support delayed wakeup for non-semaphore eventfd to reduce cpu utilization | expand

Commit Message

Wen Yang April 16, 2023, 11:31 a.m. UTC
From: Wen Yang <wenyang.linux@foxmail.com>

For the NON SEMAPHORE eventfd, if its counter has a nonzero value,
then a read(2) returns 8 bytes containing that value, and the counter's
value is reset to zero. Therefore, in the NON SEMAPHORE scenario,
N event_writes vs ONE event_read is possible.

However, the current implementation wakes up the read thread immediately
in eventfd_write so that the cpu utilization increases unnecessarily.

By adding a configurable delay after eventfd_write, these unnecessary
wakeup operations are avoided, thereby reducing cpu utilization.

We used the following test code:

 #include <assert.h>
 #include <errno.h>
 #include <unistd.h>
 #include <stdio.h>
 #include <string.h>
 #include <poll.h>
 #include <sys/eventfd.h>
 #include <sys/prctl.h>

/* Writer side: pin the thread name, then flood the eventfd with an
 * ever-increasing counter value, reporting any write failure. */
void publish(int fd)
{
	unsigned long long counter = 0;
	int rc;

	prctl(PR_SET_NAME, "publish");
	for (;;) {
		counter++;
		rc = write(fd, &counter, sizeof(counter));
		if (rc < 0)
			printf("XXX: write error: %s\n", strerror(errno));
	}
}

/* Reader side: pin the thread name, then poll the eventfd forever,
 * draining its counter whenever it becomes readable. */
void subscribe(int fd)
{
	unsigned long long value = 0;
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	int rc;

	prctl(PR_SET_NAME, "subscribe");

	usleep(10);
	for (;;) {
		rc = poll(&pfd, 1, -1);
		if (rc == -1)
			printf("XXX: poll error: %s\n", strerror(errno));
		if (pfd.revents & POLLIN)
			read(fd, &value, sizeof(value));
	}
}

/*
 * Create a non-blocking eventfd, then fork: the child becomes the
 * subscriber (reader) and the parent the publisher (writer).
 * Neither role returns; the process runs until interrupted.
 */
int main(int argc, char *argv[])
{
	pid_t pid;
	int fd;

	/* The original OR'd EFD_NONBLOCK in twice; once is sufficient. */
	fd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
	/* eventfd(2) returns -1 on failure; fd 0 itself would be a valid
	 * descriptor, so check for >= 0 rather than truthiness. */
	assert(fd >= 0);

	pid = fork();
	if (pid == 0)
		subscribe(fd);
	else if (pid > 0)
		publish(fd);
	else {
		printf("XXX: fork error!\n");
		return -1;
	}

	return 0;
}

 # taskset -c 2-3 ./a.out

The original cpu usage is as follows:
07:02:55 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
07:02:57 PM  all   16.43    0.00   16.28    0.16    0.00    0.00    0.00    0.00    0.00   67.14
07:02:57 PM    0    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
07:02:57 PM    1    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
07:02:57 PM    2   29.21    0.00   34.83    1.12    0.00    0.00    0.00    0.00    0.00   34.83
07:02:57 PM    3   51.97    0.00   48.03    0.00    0.00    0.00    0.00    0.00    0.00    0.00

07:02:57 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
07:02:59 PM  all   18.75    0.00   17.47    2.56    0.00    0.32    0.00    0.00    0.00   60.90
07:02:59 PM    0    6.88    0.00    1.59    5.82    0.00    0.00    0.00    0.00    0.00   85.71
07:02:59 PM    1    1.04    0.00    1.04    2.59    0.00    0.00    0.00    0.00    0.00   95.34
07:02:59 PM    2   26.09    0.00   35.87    0.00    0.00    1.09    0.00    0.00    0.00   36.96
07:02:59 PM    3   52.00    0.00   47.33    0.00    0.00    0.67    0.00    0.00    0.00    0.00

07:02:59 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
07:03:01 PM  all   16.15    0.00   16.77    0.00    0.00    0.00    0.00    0.00    0.00   67.08
07:03:01 PM    0    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
07:03:01 PM    1    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
07:03:01 PM    2   27.47    0.00   36.26    0.00    0.00    0.00    0.00    0.00    0.00   36.26
07:03:01 PM    3   51.30    0.00   48.70    0.00    0.00    0.00    0.00    0.00    0.00    0.00

Then setting the new control parameter, as follows:
echo 5 > /proc/sys/fs/eventfd_wakeup_delay_msec

The cpu usage was observed to decrease by more than 20% (cpu #2, 26% -> 0.x%),  as follows:

07:03:01 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
07:03:03 PM  all   10.31    0.00    8.36    0.00    0.00    0.00    0.00    0.00    0.00   81.34
07:03:03 PM    0    0.00    0.00    1.01    0.00    0.00    0.00    0.00    0.00    0.00   98.99
07:03:03 PM    1    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
07:03:03 PM    2    0.52    0.00    1.05    0.00    0.00    0.00    0.00    0.00    0.00   98.43
07:03:03 PM    3   56.59    0.00   43.41    0.00    0.00    0.00    0.00    0.00    0.00    0.00

07:03:03 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
07:03:05 PM  all   10.61    0.00    7.82    0.00    0.00    0.00    0.00    0.00    0.00   81.56
07:03:05 PM    0    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
07:03:05 PM    1    0.00    0.00    1.01    0.00    0.00    0.00    0.00    0.00    0.00   98.99
07:03:05 PM    2    0.53    0.00    0.53    0.00    0.00    0.00    0.00    0.00    0.00   98.94
07:03:05 PM    3   58.59    0.00   41.41    0.00    0.00    0.00    0.00    0.00    0.00    0.00

07:03:05 PM  CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
07:03:07 PM  all    8.99    0.00    7.25    0.72    0.00    0.00    0.00    0.00    0.00   83.04
07:03:07 PM    0    0.00    0.00    1.52    2.53    0.00    0.00    0.00    0.00    0.00   95.96
07:03:07 PM    1    0.00    0.00    0.50    0.00    0.00    0.00    0.00    0.00    0.00   99.50
07:03:07 PM    2    0.54    0.00    0.54    0.00    0.00    0.00    0.00    0.00    0.00   98.92
07:03:07 PM    3   57.55    0.00   42.45    0.00    0.00    0.00    0.00    0.00    0.00    0.00

Signed-off-by: Wen Yang <wenyang.linux@foxmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dylan Yudaken <dylany@fb.com>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Fu Wei <wefu@redhat.com>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 Documentation/admin-guide/sysctl/fs.rst | 13 +++++
 fs/eventfd.c                            | 78 ++++++++++++++++++++++++-
 init/Kconfig                            | 19 ++++++
 3 files changed, 109 insertions(+), 1 deletion(-)

Comments

Bagas Sanjaya April 17, 2023, 8:22 a.m. UTC | #1
On Sun, Apr 16, 2023 at 07:31:55PM +0800, wenyang.linux@foxmail.com wrote:
> +eventfd_wakeup_delay_msec
> +------------------

Please match the section underline length as the section text above.

> +Frequent writing of an eventfd can also lead to frequent wakeup of the peer
> +read process, resulting in significant cpu overhead.
> +How ever for the NON SEMAPHORE eventfd, if it's counter has a nonzero value,
> +then a read(2) returns 8 bytes containing that value, and the counter's value

reading eventfd?

> +is reset to zero.
> +So it coule be optimized as follows: N event_writes vs ONE event_read.
> +By adding a configurable delay after eventfd_write, these unnecessary wakeup
> +operations are avoided.

What is the connection from optimization you described to eventfd_write
delay?

Thanks.
kernel test robot April 17, 2023, 12:44 p.m. UTC | #2
Hi,

kernel test robot noticed the following build warnings:

[auto build test WARNING on vfs-idmapping/for-next]
[also build test WARNING on linus/master v6.3-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/wenyang-linux-foxmail-com/eventfd-support-delayed-wakeup-for-non-semaphore-eventfd-to-reduce-cpu-utilization/20230416-193353
base:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/idmapping.git for-next
patch link:    https://lore.kernel.org/r/tencent_AF886EF226FD9F39D28FE4D9A94A95FA2605%40qq.com
patch subject: [PATCH] eventfd: support delayed wakeup for non-semaphore eventfd to reduce cpu utilization
reproduce:
        # https://github.com/intel-lab-lkp/linux/commit/ea9214e265bae223a795f144d6ddcac65e8e2084
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review wenyang-linux-foxmail-com/eventfd-support-delayed-wakeup-for-non-semaphore-eventfd-to-reduce-cpu-utilization/20230416-193353
        git checkout ea9214e265bae223a795f144d6ddcac65e8e2084
        make menuconfig
        # enable CONFIG_COMPILE_TEST, CONFIG_WARN_MISSING_DOCUMENTS, CONFIG_WARN_ABI_ERRORS
        make htmldocs

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202304172058.piI49JCE-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> Documentation/admin-guide/sysctl/fs.rst:74: WARNING: Title underline too short.

vim +74 Documentation/admin-guide/sysctl/fs.rst

    72	
    73	eventfd_wakeup_delay_msec
  > 74	------------------
    75	Frequent writing of an eventfd can also lead to frequent wakeup of the peer
    76	read process, resulting in significant cpu overhead.
    77	How ever for the NON SEMAPHORE eventfd, if it's counter has a nonzero value,
    78	then a read(2) returns 8 bytes containing that value, and the counter's value
    79	is reset to zero.
    80	So it coule be optimized as follows: N event_writes vs ONE event_read.
    81	By adding a configurable delay after eventfd_write, these unnecessary wakeup
    82	operations are avoided.
    83	The max value is 100 ms.
    84
Jens Axboe April 17, 2023, 2:38 p.m. UTC | #3
On 4/16/23 5:31?AM, wenyang.linux@foxmail.com wrote:
> From: Wen Yang <wenyang.linux@foxmail.com>
> 
> For the NON SEMAPHORE eventfd, if it's counter has a nonzero value,
> then a read(2) returns 8 bytes containing that value, and the counter's
> value is reset to zero. Therefore, in the NON SEMAPHORE scenario,
> N event_writes vs ONE event_read is possible.
> 
> However, the current implementation wakes up the read thread immediately
> in eventfd_write so that the cpu utilization increases unnecessarily.
> 
> By adding a configurable delay after eventfd_write, these unnecessary
> wakeup operations are avoided, thereby reducing cpu utilization.

What's the real world use case of this, and what would the expected
delay be there? With using a delayed work item for this, there's
certainly a pretty wide grey zone in terms of delay where this would
perform considerably worse than not doing any delayed wakeups at all.
Wen Yang April 17, 2023, 4:32 p.m. UTC | #4
在 2023/4/17 22:38, Jens Axboe 写道:
> On 4/16/23 5:31?AM, wenyang.linux@foxmail.com wrote:
>> From: Wen Yang <wenyang.linux@foxmail.com>
>>
>> For the NON SEMAPHORE eventfd, if it's counter has a nonzero value,
>> then a read(2) returns 8 bytes containing that value, and the counter's
>> value is reset to zero. Therefore, in the NON SEMAPHORE scenario,
>> N event_writes vs ONE event_read is possible.
>>
>> However, the current implementation wakes up the read thread immediately
>> in eventfd_write so that the cpu utilization increases unnecessarily.
>>
>> By adding a configurable delay after eventfd_write, these unnecessary
>> wakeup operations are avoided, thereby reducing cpu utilization.
> What's the real world use case of this, and what would the expected
> delay be there? With using a delayed work item for this, there's
> certainly a pretty wide grey zone in terms of delay where this would
> perform considerably worse than not doing any delayed wakeups at all.


Thanks for your comments.

We have found that the CPU usage of the message middleware is high in our
environment, because sensor messages from MCU are very frequent and 
constantly
reported, possibly several hundred thousand times per second. As a result,
the message receiving thread is frequently awakened to process short 
messages.

The following is the simplified test code:
https://github.com/w-simon/tests/blob/master/src/test.c

And the test code in this patch is further simplified.

Finally, only a configuration item has been added here, allowing users 
to make
more choices.


--

Best wishes,

Wen
Jens Axboe April 19, 2023, 2:15 a.m. UTC | #5
On 4/17/23 10:32?AM, Wen Yang wrote:
> 
> ? 2023/4/17 22:38, Jens Axboe ??:
>> On 4/16/23 5:31?AM, wenyang.linux@foxmail.com wrote:
>>> From: Wen Yang <wenyang.linux@foxmail.com>
>>>
>>> For the NON SEMAPHORE eventfd, if it's counter has a nonzero value,
>>> then a read(2) returns 8 bytes containing that value, and the counter's
>>> value is reset to zero. Therefore, in the NON SEMAPHORE scenario,
>>> N event_writes vs ONE event_read is possible.
>>>
>>> However, the current implementation wakes up the read thread immediately
>>> in eventfd_write so that the cpu utilization increases unnecessarily.
>>>
>>> By adding a configurable delay after eventfd_write, these unnecessary
>>> wakeup operations are avoided, thereby reducing cpu utilization.
>> What's the real world use case of this, and what would the expected
>> delay be there? With using a delayed work item for this, there's
>> certainly a pretty wide grey zone in terms of delay where this would
>> perform considerably worse than not doing any delayed wakeups at all.
> 
> 
> Thanks for your comments.
> 
> We have found that the CPU usage of the message middleware is high in
> our environment, because sensor messages from MCU are very frequent
> and constantly reported, possibly several hundred thousand times per
> second. As a result, the message receiving thread is frequently
> awakened to process short messages.
> 
> The following is the simplified test code:
> https://github.com/w-simon/tests/blob/master/src/test.c
> 
> And the test code in this patch is further simplified.
> 
> Finally, only a configuration item has been added here, allowing users
> to make more choices.

I think you'd have a higher chance of getting this in if the delay
setting was per eventfd context, rather than a global thing.
Christian Brauner April 19, 2023, 9:12 a.m. UTC | #6
On Tue, Apr 18, 2023 at 08:15:03PM -0600, Jens Axboe wrote:
> On 4/17/23 10:32?AM, Wen Yang wrote:
> > 
> > ? 2023/4/17 22:38, Jens Axboe ??:
> >> On 4/16/23 5:31?AM, wenyang.linux@foxmail.com wrote:
> >>> From: Wen Yang <wenyang.linux@foxmail.com>
> >>>
> >>> For the NON SEMAPHORE eventfd, if it's counter has a nonzero value,
> >>> then a read(2) returns 8 bytes containing that value, and the counter's
> >>> value is reset to zero. Therefore, in the NON SEMAPHORE scenario,
> >>> N event_writes vs ONE event_read is possible.
> >>>
> >>> However, the current implementation wakes up the read thread immediately
> >>> in eventfd_write so that the cpu utilization increases unnecessarily.
> >>>
> >>> By adding a configurable delay after eventfd_write, these unnecessary
> >>> wakeup operations are avoided, thereby reducing cpu utilization.
> >> What's the real world use case of this, and what would the expected
> >> delay be there? With using a delayed work item for this, there's
> >> certainly a pretty wide grey zone in terms of delay where this would
> >> perform considerably worse than not doing any delayed wakeups at all.
> > 
> > 
> > Thanks for your comments.
> > 
> > We have found that the CPU usage of the message middleware is high in
> > our environment, because sensor messages from MCU are very frequent
> > and constantly reported, possibly several hundred thousand times per
> > second. As a result, the message receiving thread is frequently
> > awakened to process short messages.
> > 
> > The following is the simplified test code:
> > https://github.com/w-simon/tests/blob/master/src/test.c
> > 
> > And the test code in this patch is further simplified.
> > 
> > Finally, only a configuration item has been added here, allowing users
> > to make more choices.
> 
> I think you'd have a higher chance of getting this in if the delay
> setting was per eventfd context, rather than a global thing.

That patch seems really weird. Is that an established paradigm to
address problems like this through a configured wakeup delay? Because
naively this looks like a pretty brutal hack.
Wen Yang April 19, 2023, 3:23 p.m. UTC | #7
在 2023/4/19 17:12, Christian Brauner 写道:
> On Tue, Apr 18, 2023 at 08:15:03PM -0600, Jens Axboe wrote:
>> On 4/17/23 10:32?AM, Wen Yang wrote:
>>> ? 2023/4/17 22:38, Jens Axboe ??:
>>>> On 4/16/23 5:31?AM, wenyang.linux@foxmail.com wrote:
>>>>> From: Wen Yang <wenyang.linux@foxmail.com>
>>>>>
>>>>> For the NON SEMAPHORE eventfd, if it's counter has a nonzero value,
>>>>> then a read(2) returns 8 bytes containing that value, and the counter's
>>>>> value is reset to zero. Therefore, in the NON SEMAPHORE scenario,
>>>>> N event_writes vs ONE event_read is possible.
>>>>>
>>>>> However, the current implementation wakes up the read thread immediately
>>>>> in eventfd_write so that the cpu utilization increases unnecessarily.
>>>>>
>>>>> By adding a configurable delay after eventfd_write, these unnecessary
>>>>> wakeup operations are avoided, thereby reducing cpu utilization.
>>>> What's the real world use case of this, and what would the expected
>>>> delay be there? With using a delayed work item for this, there's
>>>> certainly a pretty wide grey zone in terms of delay where this would
>>>> perform considerably worse than not doing any delayed wakeups at all.
>>>
>>> Thanks for your comments.
>>>
>>> We have found that the CPU usage of the message middleware is high in
>>> our environment, because sensor messages from MCU are very frequent
>>> and constantly reported, possibly several hundred thousand times per
>>> second. As a result, the message receiving thread is frequently
>>> awakened to process short messages.
>>>
>>> The following is the simplified test code:
>>> https://github.com/w-simon/tests/blob/master/src/test.c
>>>
>>> And the test code in this patch is further simplified.
>>>
>>> Finally, only a configuration item has been added here, allowing users
>>> to make more choices.
>> I think you'd have a higher chance of getting this in if the delay
>> setting was per eventfd context, rather than a global thing.

Thank you.
We will follow your suggestion to change the global configuration to per eventfd.

> That patch seems really weird. Is that an established paradigm to
> address problems like this through a configured wakeup delay? Because
> naively this looks like a pretty brutal hack.

Thanks.

Well, what you are concerned about may be that the rough delay may cause 
additional problems, which is indeed worth considering.

Meanwhile, prolonged and frequent write_eventfd calls are actually 
another type of attack.

If we change it to this:

When a continuous write_eventfd reaches a certain threshold in a short 
period of time, a delay is added as a penalty.

Do you think this is acceptable?


--

Best wishes,

Wen
Jens Axboe April 19, 2023, 4:42 p.m. UTC | #8
On 4/19/23 3:12?AM, Christian Brauner wrote:
> On Tue, Apr 18, 2023 at 08:15:03PM -0600, Jens Axboe wrote:
>> On 4/17/23 10:32?AM, Wen Yang wrote:
>>>
>>> ? 2023/4/17 22:38, Jens Axboe ??:
>>>> On 4/16/23 5:31?AM, wenyang.linux@foxmail.com wrote:
>>>>> From: Wen Yang <wenyang.linux@foxmail.com>
>>>>>
>>>>> For the NON SEMAPHORE eventfd, if it's counter has a nonzero value,
>>>>> then a read(2) returns 8 bytes containing that value, and the counter's
>>>>> value is reset to zero. Therefore, in the NON SEMAPHORE scenario,
>>>>> N event_writes vs ONE event_read is possible.
>>>>>
>>>>> However, the current implementation wakes up the read thread immediately
>>>>> in eventfd_write so that the cpu utilization increases unnecessarily.
>>>>>
>>>>> By adding a configurable delay after eventfd_write, these unnecessary
>>>>> wakeup operations are avoided, thereby reducing cpu utilization.
>>>> What's the real world use case of this, and what would the expected
>>>> delay be there? With using a delayed work item for this, there's
>>>> certainly a pretty wide grey zone in terms of delay where this would
>>>> perform considerably worse than not doing any delayed wakeups at all.
>>>
>>>
>>> Thanks for your comments.
>>>
>>> We have found that the CPU usage of the message middleware is high in
>>> our environment, because sensor messages from MCU are very frequent
>>> and constantly reported, possibly several hundred thousand times per
>>> second. As a result, the message receiving thread is frequently
>>> awakened to process short messages.
>>>
>>> The following is the simplified test code:
>>> https://github.com/w-simon/tests/blob/master/src/test.c
>>>
>>> And the test code in this patch is further simplified.
>>>
>>> Finally, only a configuration item has been added here, allowing users
>>> to make more choices.
>>
>> I think you'd have a higher chance of getting this in if the delay
>> setting was per eventfd context, rather than a global thing.
> 
> That patch seems really weird. Is that an established paradigm to
> address problems like this through a configured wakeup delay? Because
> naively this looks like a pretty brutal hack.

It is odd, and it is a brutal hack. My worries were outlined in an
earlier reply, there's quite a big gap where no delay would be better
and the delay approach would be miserable because it'd cause extra
latency and extra context switches. It'd be much cleaner if you KNEW
there'd be more events coming, as you could then get rid of that delayed
work item completely. And I suspect, if this patch makes sense, that
it'd be better to have a number+time limit as well and if you hit the
event number count that you'd notify inline and put some smarts in the
delayed work handling to just not do anything if nothing is pending.
Wen Yang April 20, 2023, 5:44 p.m. UTC | #9
在 2023/4/20 00:42, Jens Axboe 写道:
> On 4/19/23 3:12?AM, Christian Brauner wrote:
>> On Tue, Apr 18, 2023 at 08:15:03PM -0600, Jens Axboe wrote:
>>> On 4/17/23 10:32?AM, Wen Yang wrote:
>>>> ? 2023/4/17 22:38, Jens Axboe ??:
>>>>> On 4/16/23 5:31?AM, wenyang.linux@foxmail.com wrote:
>>>>>> From: Wen Yang <wenyang.linux@foxmail.com>
>>>>>>
>>>>>> For the NON SEMAPHORE eventfd, if it's counter has a nonzero value,
>>>>>> then a read(2) returns 8 bytes containing that value, and the counter's
>>>>>> value is reset to zero. Therefore, in the NON SEMAPHORE scenario,
>>>>>> N event_writes vs ONE event_read is possible.
>>>>>>
>>>>>> However, the current implementation wakes up the read thread immediately
>>>>>> in eventfd_write so that the cpu utilization increases unnecessarily.
>>>>>>
>>>>>> By adding a configurable delay after eventfd_write, these unnecessary
>>>>>> wakeup operations are avoided, thereby reducing cpu utilization.
>>>>> What's the real world use case of this, and what would the expected
>>>>> delay be there? With using a delayed work item for this, there's
>>>>> certainly a pretty wide grey zone in terms of delay where this would
>>>>> perform considerably worse than not doing any delayed wakeups at all.
>>>>
>>>> Thanks for your comments.
>>>>
>>>> We have found that the CPU usage of the message middleware is high in
>>>> our environment, because sensor messages from MCU are very frequent
>>>> and constantly reported, possibly several hundred thousand times per
>>>> second. As a result, the message receiving thread is frequently
>>>> awakened to process short messages.
>>>>
>>>> The following is the simplified test code:
>>>> https://github.com/w-simon/tests/blob/master/src/test.c
>>>>
>>>> And the test code in this patch is further simplified.
>>>>
>>>> Finally, only a configuration item has been added here, allowing users
>>>> to make more choices.
>>> I think you'd have a higher chance of getting this in if the delay
>>> setting was per eventfd context, rather than a global thing.
>> That patch seems really weird. Is that an established paradigm to
>> address problems like this through a configured wakeup delay? Because
>> naively this looks like a pretty brutal hack.
> It is odd, and it is a brutal hack. My worries were outlined in an
> earlier reply, there's quite a big gap where no delay would be better
> and the delay approach would be miserable because it'd cause extra
> latency and extra context switches. It'd be much cleaner if you KNEW
> there'd be more events coming, as you could then get rid of that delayed
> work item completely. And I suspect, if this patch makes sense, that
> it'd be better to have a number+time limit as well and if you hit the
> event number count that you'd notify inline and put some smarts in the
> delayed work handling to just not do anything if nothing is pending.

Thank you very much for your suggestion.

We will improve the implementation according to your suggestion and send 
the v2 later.


--

Best wishes,

Wen
Wen Yang May 4, 2023, 4:01 p.m. UTC | #10
在 2023/4/21 01:44, Wen Yang 写道:
>
> 在 2023/4/20 00:42, Jens Axboe 写道:
>> On 4/19/23 3:12?AM, Christian Brauner wrote:
>>> On Tue, Apr 18, 2023 at 08:15:03PM -0600, Jens Axboe wrote:
>>>> On 4/17/23 10:32?AM, Wen Yang wrote:
>>>>> ? 2023/4/17 22:38, Jens Axboe ??:
>>>>>> On 4/16/23 5:31?AM, wenyang.linux@foxmail.com wrote:
>>>>>>> From: Wen Yang <wenyang.linux@foxmail.com>
>>>>>>>
>>>>>>> For the NON SEMAPHORE eventfd, if it's counter has a nonzero value,
>>>>>>> then a read(2) returns 8 bytes containing that value, and the 
>>>>>>> counter's
>>>>>>> value is reset to zero. Therefore, in the NON SEMAPHORE scenario,
>>>>>>> N event_writes vs ONE event_read is possible.
>>>>>>>
>>>>>>> However, the current implementation wakes up the read thread 
>>>>>>> immediately
>>>>>>> in eventfd_write so that the cpu utilization increases 
>>>>>>> unnecessarily.
>>>>>>>
>>>>>>> By adding a configurable delay after eventfd_write, these 
>>>>>>> unnecessary
>>>>>>> wakeup operations are avoided, thereby reducing cpu utilization.
>>>>>> What's the real world use case of this, and what would the expected
>>>>>> delay be there? With using a delayed work item for this, there's
>>>>>> certainly a pretty wide grey zone in terms of delay where this would
>>>>>> perform considerably worse than not doing any delayed wakeups at 
>>>>>> all.
>>>>>
>>>>> Thanks for your comments.
>>>>>
>>>>> We have found that the CPU usage of the message middleware is high in
>>>>> our environment, because sensor messages from MCU are very frequent
>>>>> and constantly reported, possibly several hundred thousand times per
>>>>> second. As a result, the message receiving thread is frequently
>>>>> awakened to process short messages.
>>>>>
>>>>> The following is the simplified test code:
>>>>> https://github.com/w-simon/tests/blob/master/src/test.c
>>>>>
>>>>> And the test code in this patch is further simplified.
>>>>>
>>>>> Finally, only a configuration item has been added here, allowing 
>>>>> users
>>>>> to make more choices.
>>>> I think you'd have a higher chance of getting this in if the delay
>>>> setting was per eventfd context, rather than a global thing.
>>> That patch seems really weird. Is that an established paradigm to
>>> address problems like this through a configured wakeup delay? Because
>>> naively this looks like a pretty brutal hack.
>> It is odd, and it is a brutal hack. My worries were outlined in an
>> earlier reply, there's quite a big gap where no delay would be better
>> and the delay approach would be miserable because it'd cause extra
>> latency and extra context switches. It'd be much cleaner if you KNEW
>> there'd be more events coming, as you could then get rid of that delayed
>> work item completely. And I suspect, if this patch makes sense, that
>> it'd be better to have a number+time limit as well and if you hit the
>> event number count that you'd notify inline and put some smarts in the
>> delayed work handling to just not do anything if nothing is pending.
>
> Thank you very much for your suggestion.
>
> We will improve the implementation according to your suggestion and 
> send the v2 later.
>
>
Hi Jens, Christian,

Based on your valuable suggestions and inspiration from TCP's 
/Delayed ACK/ technology, we have reimplemented v2 and are currently 
testing it.

After several days of testing, we will send it again.

Thanks.


--

Best wishes,

Wen
diff mbox series

Patch

diff --git a/Documentation/admin-guide/sysctl/fs.rst b/Documentation/admin-guide/sysctl/fs.rst
index a321b84eccaa..7baf702c2f72 100644
--- a/Documentation/admin-guide/sysctl/fs.rst
+++ b/Documentation/admin-guide/sysctl/fs.rst
@@ -70,6 +70,19 @@  negative dentries which do not map to any files. Instead,
 they help speeding up rejection of non-existing files provided
 by the users.
 
+eventfd_wakeup_delay_msec
+-------------------------
+Frequent writing of an eventfd can also lead to frequent wakeup of the peer
+read process, resulting in significant cpu overhead.
+However, for the NON SEMAPHORE eventfd, if its counter has a nonzero value,
+then a read(2) returns 8 bytes containing that value, and the counter's value
+is reset to zero.
+So it could be optimized as follows: N event_writes vs ONE event_read.
+By adding a configurable delay after eventfd_write, these unnecessary wakeup
+operations are avoided.
+The max value is 100 ms.
+
+Default: 0
 
 file-max & file-nr
 ------------------
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 95850a13ce8d..c34fff843c48 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -41,6 +41,9 @@  struct eventfd_ctx {
 	__u64 count;
 	unsigned int flags;
 	int id;
+#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
+	struct delayed_work dwork;
+#endif
 };
 
 __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
@@ -95,6 +98,9 @@  static void eventfd_free_ctx(struct eventfd_ctx *ctx)
 {
 	if (ctx->id >= 0)
 		ida_simple_remove(&eventfd_ida, ctx->id);
+#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
+	flush_delayed_work(&ctx->dwork);
+#endif
 	kfree(ctx);
 }
 
@@ -256,6 +262,28 @@  static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
 	return sizeof(ucnt);
 }
 
+#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
+
+static unsigned long eventfd_wake_delay_jiffies;
+
+static void eventfd_delayed_workfn(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct eventfd_ctx *ctx = container_of(dwork, struct eventfd_ctx, dwork);
+
+	spin_lock_irq(&ctx->wqh.lock);
+	current->in_eventfd = 1;
+	if (ctx->count) {
+		/* waitqueue_active is safe because ctx->wqh.lock is being held here. */
+		if (waitqueue_active(&ctx->wqh))
+			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+	}
+	current->in_eventfd = 0;
+	spin_unlock_irq(&ctx->wqh.lock);
+}
+
+#endif
+
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
 			     loff_t *ppos)
 {
@@ -282,8 +310,27 @@  static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 	if (likely(res > 0)) {
 		ctx->count += ucnt;
 		current->in_eventfd = 1;
-		if (waitqueue_active(&ctx->wqh))
+
+		/* waitqueue_active is safe because ctx->wqh.lock is being held here. */
+		if (waitqueue_active(&ctx->wqh)) {
+#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
+			if (ctx->flags & EFD_SEMAPHORE)
+				wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+			else {
+				unsigned long delay = eventfd_wake_delay_jiffies;
+
+				if (delay) {
+					if (!delayed_work_pending(&ctx->dwork))
+						queue_delayed_work(system_unbound_wq,
+								&ctx->dwork, delay);
+				} else
+					wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+			}
+#else
 			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+#endif
+		}
+
 		current->in_eventfd = 0;
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
@@ -406,6 +453,9 @@  static int do_eventfd(unsigned int count, int flags)
 	ctx->count = count;
 	ctx->flags = flags;
 	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
+#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
+	INIT_DELAYED_WORK(&ctx->dwork, eventfd_delayed_workfn);
+#endif
 
 	flags &= EFD_SHARED_FCNTL_FLAGS;
 	flags |= O_RDWR;
@@ -438,3 +488,29 @@  SYSCALL_DEFINE1(eventfd, unsigned int, count)
 	return do_eventfd(count, 0);
 }
 
+#ifdef CONFIG_EVENTFD_WAKEUP_DELAY
+
+static const unsigned long eventfd_wake_delay_max = HZ / 10;
+
+static struct ctl_table fs_eventfd_ctl[] = {
+	{
+		.procname      = "eventfd_wakeup_delay_msec",
+		.data          = &eventfd_wake_delay_jiffies,
+		.maxlen        = sizeof(eventfd_wake_delay_jiffies),
+		.mode          = 0644,
+		.proc_handler  = proc_doulongvec_ms_jiffies_minmax,
+		.extra1        = SYSCTL_ZERO,
+		.extra2        = (void *)&eventfd_wake_delay_max,
+	},
+	{ }
+};
+
+static int __init init_fs_eventfd_sysctls(void)
+{
+	register_sysctl_init("fs", fs_eventfd_ctl);
+	return 0;
+}
+
+fs_initcall(init_fs_eventfd_sysctls);
+
+#endif /* CONFIG_EVENTFD_WAKEUP_DELAY */
diff --git a/init/Kconfig b/init/Kconfig
index 750d41a38574..23d68bcc1f19 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1629,6 +1629,25 @@  config EVENTFD
 
 	  If unsure, say Y.
 
+if EVENTFD
+config EVENTFD_WAKEUP_DELAY
+	bool "support delayed wakeup for the non-semaphore eventfd" if EXPERT
+	default n
+	depends on SYSCTL
+	help
+	  This option enables the delayed wakeup for the non-semaphore eventfd.
+	  Frequent writing of an eventfd can also lead to frequent wakeup of
+	  the peer read process, resulting in significant cpu overhead.
+	  However, for the NON SEMAPHORE eventfd, if its counter has a
+	  nonzero value, then a read(2) returns 8 bytes containing that value,
+	  and the counter's value is reset to zero.
+	  By adding a configurable delay after eventfd_write, these unnecessary
+	  wakeup operations are avoided.
+
+	  If unsure, say N.
+
+endif # EVENTFD
+
 config SHMEM
 	bool "Use full shmem filesystem" if EXPERT
 	default y