diff mbox series

kernel-shark: Multi-thread the computaion of CPU graph

Message ID 20240923090912.1261220-1-libo.chen@oracle.com (mailing list archive)
State New
Headers show
Series kernel-shark: Multi-thread the computaion of CPU graph | expand

Commit Message

Libo Chen Sept. 23, 2024, 9:09 a.m. UTC
Parallelize _newCPUGraph() calls to dramatically speed up
graph rendering particularly for traces from very large systems.

OpenMP technically is a new dependency here, but it's so embedded
with GCC toolchains, as long as your GCC is not older than v4.9,
the libgomp library that comes with it will work.

Signed-off-by: Libo Chen <libo.chen@oracle.com>
---
 CMakeLists.txt     |  5 +++++
 src/KsGLWidget.cpp | 15 ++++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

Comments

Yordan Karadzhov Sept. 28, 2024, 2:32 p.m. UTC | #1
Hi Libo,

I like the idea of the patch, however a bit of extra work is needed 
before merging it. See my comments in the code below.

On 9/23/24 12:09, Libo Chen wrote:
> Parallelize _newCPUGraph() calls to dramatically speed up
> graph rendering particularly for traces from very large systems.
> 
> OpenMP technically is a new dependency here, but it's so embedded
> with GCC toolchains, as long as your GCC is not older than v4.9,
> the libgomp library that comes with it will work.
> 
> Signed-off-by: Libo Chen <libo.chen@oracle.com>
> ---
>   CMakeLists.txt     |  5 +++++
>   src/KsGLWidget.cpp | 15 ++++++++++++++-
>   2 files changed, 19 insertions(+), 1 deletion(-)
> 
> diff --git a/CMakeLists.txt b/CMakeLists.txt
> index c36d757..8dd9ff9 100644
> --- a/CMakeLists.txt
> +++ b/CMakeLists.txt
> @@ -84,6 +84,11 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin")
>   set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>   
> +find_package(OpenMP 3.2.5)
> +if (OPENMP_FOUND)
> +    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
> +    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")

The "if" must be closed.

> +
>   set(CMAKE_CXX_STANDARD 17)
>   set(CMAKE_CXX_STANDARD_REQUIRED ON)
>   set(CMAKE_CXX_EXTENSIONS OFF)
> diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp
> index 9311d98..00a1951 100644
> --- a/src/KsGLWidget.cpp
> +++ b/src/KsGLWidget.cpp
> @@ -13,6 +13,9 @@
>   #include <GL/glut.h>
>   #include <GL/gl.h>
>   
> +// OpenMP
> +#include <omp.h>
> +
>   // KernelShark
>   #include "libkshark-plugin.h"
>   #include "KsGLWidget.hpp"
> @@ -690,10 +693,20 @@ void KsGLWidget::_makeGraphs()
>   
>   	for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) {
>   		sd = it.key();
> +		QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count());
> +		QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count());
> +
>   		/* Create CPU graphs according to the cpuList. */
>   		it.value()._cpuGraphs = {};
> +		omp_set_num_threads(omp_get_num_procs());

Not sure how setting the thread number works in opm, but I would guess 
that you have to do it just once. If this is the case, please move the 
the line above to the constructor. Note that _makeGraphs() gets executed 
almost every time you do something in the GUI (zoom, shift, select 
event, ...)

> +		#pragma omp parallel for
>   		for (auto const &cpu: it.value()._cpuList) {
> -			g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing);
> +			int idx = it.value()._cpuList.indexOf(cpu);
> +			cpuGraphs[idx] = _newCPUGraph(sd, cpu);
> +		}
> +		QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs);
> +		while (itCpuGraphs.hasNext()) {
> +			g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing);
>   			it.value()._cpuGraphs.append(g);
>   		}
>   

I wonder why you add this optimization only for the CPU graphs? Please 
do the same for the Task and Combo graphs as well.

Thanks!
Yordan
Libo Chen Sept. 28, 2024, 11:16 p.m. UTC | #2
(Resend in plain text)

Hi Yordan,

> On Sep 28, 2024, at 07:32, Yordan Karadzhov <y.karadz@gmail.com> wrote:
> 
> Hi Libo,
> 
> I like the idea of the patch, however a bit of extra work is needed before merging it. See my comments in the code below.
> 
> On 9/23/24 12:09, Libo Chen wrote:
>> Parallelize _newCPUGraph() calls to dramatically speed up
>> graph rendering particularly for traces from very large systems.
>> OpenMP technically is a new dependency here, but it's so embedded
>> with GCC toolchains, as long as your GCC is not older than v4.9,
>> the libgomp library that comes with it will work.
>> Signed-off-by: Libo Chen <libo.chen@oracle.com>
>> ---
>>  CMakeLists.txt     |  5 +++++
>>  src/KsGLWidget.cpp | 15 ++++++++++++++-
>>  2 files changed, 19 insertions(+), 1 deletion(-)
>> diff --git a/CMakeLists.txt b/CMakeLists.txt
>> index c36d757..8dd9ff9 100644
>> --- a/CMakeLists.txt
>> +++ b/CMakeLists.txt
>> @@ -84,6 +84,11 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin")
>>  set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>>  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>>  +find_package(OpenMP 3.2.5)
>> +if (OPENMP_FOUND)
>> +    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
>> +    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
> 
> The "if" must be closed.

Whoops, will fix that 

>> +
>>  set(CMAKE_CXX_STANDARD 17)
>>  set(CMAKE_CXX_STANDARD_REQUIRED ON)
>>  set(CMAKE_CXX_EXTENSIONS OFF)
>> diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp
>> index 9311d98..00a1951 100644
>> --- a/src/KsGLWidget.cpp
>> +++ b/src/KsGLWidget.cpp
>> @@ -13,6 +13,9 @@
>>  #include <GL/glut.h>
>>  #include <GL/gl.h>
>>  +// OpenMP
>> +#include <omp.h>
>> +
>>  // KernelShark
>>  #include "libkshark-plugin.h"
>>  #include "KsGLWidget.hpp"
>> @@ -690,10 +693,20 @@ void KsGLWidget::_makeGraphs()
>>     for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) {
>>   sd = it.key();
>> + QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count());
>> + QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count());
>> +
>>   /* Create CPU graphs according to the cpuList. */
>>   it.value()._cpuGraphs = {};
>> + omp_set_num_threads(omp_get_num_procs());
> 
> Not sure how setting the thread number works in opm, but I would guess that you have to do it just once. If this is the case, please move the the line above to the constructor. Note that _makeGraphs() gets executed almost every time you do something in the GUI (zoom, shift, select event, ...)

Yeah, I did think about setting it once but the only reason I was considering not doing that is one can plug/unplug or online/offline CPUs while kernelshark is running. This is somewhat an edge case on baremetals but could be more common in VMs. I will take your advice here unless somebody in the future starts to complain about it and TBH one should probably not change the number of online CPUs while having CPU intensive workloads running. 

>> + #pragma omp parallel for
>>   for (auto const &cpu: it.value()._cpuList) {
>> - g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing);
>> + int idx = it.value()._cpuList.indexOf(cpu);
>> + cpuGraphs[idx] = _newCPUGraph(sd, cpu);
>> + }
>> + QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs);
>> + while (itCpuGraphs.hasNext()) {
>> + g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing);
>>   it.value()._cpuGraphs.append(g);
>>   }
>>  
> 
> I wonder why you add this optimization only for the CPU graphs? Please do the same for the Task and Combo graphs as well.

Haha, it was intentional. I was just trying to gauge interest on this optimization. I will send out a new one to include other graphs. 

Best,
Libo  

> Thanks!
> Yordan
diff mbox series

Patch

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c36d757..8dd9ff9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,6 +84,11 @@  set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin")
 set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
 
+find_package(OpenMP 3.2.5)
+if (OPENMP_FOUND)
+    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp
index 9311d98..00a1951 100644
--- a/src/KsGLWidget.cpp
+++ b/src/KsGLWidget.cpp
@@ -13,6 +13,9 @@ 
 #include <GL/glut.h>
 #include <GL/gl.h>
 
+// OpenMP
+#include <omp.h>
+
 // KernelShark
 #include "libkshark-plugin.h"
 #include "KsGLWidget.hpp"
@@ -690,10 +693,20 @@  void KsGLWidget::_makeGraphs()
 
 	for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) {
 		sd = it.key();
+		QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count());
+		QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count());
+
 		/* Create CPU graphs according to the cpuList. */
 		it.value()._cpuGraphs = {};
+		omp_set_num_threads(omp_get_num_procs());
+		#pragma omp parallel for
 		for (auto const &cpu: it.value()._cpuList) {
-			g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing);
+			int idx = it.value()._cpuList.indexOf(cpu);
+			cpuGraphs[idx] = _newCPUGraph(sd, cpu);
+		}
+		QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs);
+		while (itCpuGraphs.hasNext()) {
+			g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing);
 			it.value()._cpuGraphs.append(g);
 		}