diff mbox series

Testing DDR MPAM monitor (mbm_total_bytes)

Message ID MW4PR18MB50841317ED500974EF975F70C6F3A@MW4PR18MB5084.namprd18.prod.outlook.com (mailing list archive)
State New, archived
Headers show
Series Testing DDR MPAM monitor (mbm_total_bytes) | expand

Commit Message

Amit Singh Tomar Sept. 10, 2023, 6:58 p.m. UTC
Hi James,

We have two types of MSC's: 

1 ) L3 MSC with controls/features:
    "mpam_feat_cpor_part", and "mpam_feat_msmon_csu",

2) DDR MSC with controls/features:
    "mpam_feat_mbw_min", "mpam_feat_mbw_max", "mpam_feat_msmon_mbwu", "mpam_feat_msmon_mbwu_63counter", and "mpam_feat_msmon_mbwu_rwbw".

Trying to test DDR MPAM monitors (under DDR MSC mapped to mpam_feat_msmon_mbwu feature) using your latest snapshot[1] (mpam/snapshot/v6.5-rc1) but found few issues.

1) When try to mount the resource control, seeing the following stack trace:

root@localhost:~# mount -t resctrl resctrl -o this_is_not_abi /sys/fs/resctrl
mount: /sys/fs/resctrl: permission denied.
root@localhost:~# dmesg | tail -33
[   36.719569] ------------[ cut here ]------------
[   36.719571] WARNING: CPU: 23 PID: 786 at fs/resctrl/rdtgroup.c:3109 mkdir_mondata_subdir+0x214/0x228
[   36.719579] Modules linked in:
[   36.719580] CPU: 23 PID: 786 Comm: mount Not tainted 6.5.0-rc1-g9f0a8101361c-dirty #2
[   36.719582] Hardware name: Marvell SP1W5NXX board (DT)
[   36.719582] pstate: 63400009 (nZCv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
[   36.719584] pc : mkdir_mondata_subdir+0x214/0x228
[   36.719586] lr : mkdir_mondata_subdir+0x90/0x228
[   36.719588] sp : ffff800082d73b00
[   36.719589] x29: ffff800082d73b10 x28: ffff00011dc1bc00 x27: 0000000000000000
[   36.719591] x26: ffffa9a19cbe5388 x25: ffffa9a19cba5880 x24: ffff000103364008
[   36.719593] x23: ffffa9a19cbe5320 x22: ffff00010675c280 x21: ffff00010675c480
[   36.719594] x20: 000000000675c480 x19: ffff00010675c280 x18: 0000000000000030
[   36.719596] x17: 0000000000000000 x16: 0000000000000000 x15: ffffffffffffffff
[   36.719598] x14: ffff800102d73bb7 x13: 0000000000000002 x12: ffff800082d73bc0
[   36.719599] x11: 0000000000000001 x10: 000000000000005f x9 : 0000000000000001
[   36.719601] x8 : 0101010101010101 x7 : ffff7f7f7fff7f7f x6 : fefdff0005c7ff2f
[   36.719603] x5 : 0000800000008000 x4 : 0000000000000006 x3 : 0000000000000000
[   36.719604] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffffa9a19cbe5388
[   36.719606] Call trace:
[   36.719607]  mkdir_mondata_subdir+0x214/0x228
[   36.719609]  mkdir_mondata_all+0xb0/0x110
[   36.719612]  rdt_get_tree+0x218/0x500
[   36.719614]  vfs_get_tree+0x28/0xec
[   36.719616]  path_mount+0x3d4/0xa4c
[   36.719618]  __arm64_sys_mount+0x1d4/0x2b0
[   36.719619]  invoke_syscall+0x48/0x114
[   36.719622]  el0_svc_common.constprop.0+0x44/0xe4
[   36.719625]  do_el0_svc+0x38/0xa4
[   36.719627]  el0_svc+0x2c/0x84
[   36.719630]  el0t_64_sync_handler+0xc0/0xc4
[   36.719632]  el0t_64_sync+0x190/0x194
[   36.719633] ---[ end trace 0000000000000000 ]---

It is due to the fact the event list for MBA resource is empty [1] (event list is created only for L3 resource), and we hit here:

https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/tree/fs/resctrl/rdtgroup.c?h=mpam/snapshot/v6.5-rc1#n3109

With the following change, managed to mount the resource control:

# git diff fs/resctrl/rdtgroup.c 
diff --git a/drivers/platform/mpam/mpam_resctrl.c b/drivers/platform/mpam/mpam_resctrl.c
index 6e9925fcf934..4d9beed0b6cd 100644

@@ -397,6 +404,13 @@ int resctrl_arch_rmid_read(struct rdt_resource     *r, struct rdt_domain *d,
        u32 mon = *(u32 *)arch_mon_ctx;
        enum mpam_device_features type;

+       pr_info("resource name is %s\n", r->name);
+       pr_info("event id is %x\n", eventid);
+       pr_info("type set is %x\n", type);
+
+       if (!strcmp(r->name, "L3"))
+               return 0;
+
@@ -740,15 +754,23 @@ static void mpam_resctrl_pick_mba(void)
 
 bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
 {
+       struct mpam_props *cprops;
+
        switch (evt) {
         case QOS_L3_MBM_LOCAL_EVENT_ID:
-               struct mpam_props *cprops;
-
                if (!mbm_local_class)
 
        dom = container_of(d, struct mpam_resctrl_dom, resctrl_dom);
@@ -740,15 +754,23 @@ static void mpam_resctrl_pick_mba(void)
 
 bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
 {
+       struct mpam_props *cprops;
+
        switch (evt) {
         case QOS_L3_MBM_LOCAL_EVENT_ID:
-               struct mpam_props *cprops;
-
                if (!mbm_local_class)
                        return false;
                cprops = &mbm_local_class->props;
 
                return mpam_has_feature(mpam_feat_msmon_mbwu_rwbw, cprops);
+       case QOS_L3_MBM_TOTAL_EVENT_ID:
+               if (!mbm_total_class)
+                       return false;
+
+               cprops = &mbm_total_class->props;
+               pr_info("total_cprops feature is %x\n", cprops->features);
+
+               return mpam_has_feature(mpam_feat_msmon_mbwu_rwbw, cprops);             
        default:
                return false;
        }

At the moment, not really sure how to get it working properly when DDR MPAM monitors (enumerated as part of DDR MSC) is coupled with L3 resource, .i.e.
"MBM events are also part of RDT_RESOURCE_L3 resource because as per the SDM the total and local memory bandwidth are enumerated as part of L3 monitoring"

Thanks
-Amit

P.S: 

We posted this RFC[3] earlier that demonstrates DDR MPAM monitors working (decoupled it from L3 resource).

[1]: https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/tree/fs/resctrl/monitor.c?h=mpam/snapshot/v6.5-rc1#n799
[2]: https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/tree/drivers/platform/mpam/mpam_devices.c?h=mpam/snapshot/v6.5-rc1#n1004
[3]: https://lore.kernel.org/linux-arm-kernel/20230308134539.3071034-2-amitsinght@marvell.com/T/

Comments

Peter Newman Sept. 12, 2023, 8:56 a.m. UTC | #1
Hi Amit,

On Sun, Sep 10, 2023 at 8:58 PM Amit Singh Tomar <amitsinght@marvell.com> wrote:
>
> Hi James,
>
> We have two types of MSC's:
>
> 1 ) L3 MSC with controls/features:
>     "mpam_feat_cpor_part", and "mpam_feat_msmon_csu",
>
> 2) DDR MSC with controls/features:
>     "mpam_feat_mbw_min", "mpam_feat_mbw_max", "mpam_feat_msmon_mbwu", "mpam_feat_msmon_mbwu_63counter", and "mpam_feat_msmon_mbwu_rwbw".
>
> Trying to test DDR MPAM monitors (under DDR MSC mapped to mpam_feat_msmon_mbwu feature) using your latest snapshot[1] (mpam/snapshot/v6.5-rc1) but found few issues.
>
> 1) When try to mount the resource control, seeing the following stack trace:
>
> root@localhost:~# mount -t resctrl resctrl -o this_is_not_abi /sys/fs/resctrl
> mount: /sys/fs/resctrl: permission denied.
> root@localhost:~# dmesg | tail -33
> [   36.719569] ------------[ cut here ]------------
> [   36.719571] WARNING: CPU: 23 PID: 786 at fs/resctrl/rdtgroup.c:3109 mkdir_mondata_subdir+0x214/0x228
> [   36.719579] Modules linked in:
> [   36.719580] CPU: 23 PID: 786 Comm: mount Not tainted 6.5.0-rc1-g9f0a8101361c-dirty #2
> [   36.719582] Hardware name: Marvell SP1W5NXX board (DT)
> [   36.719582] pstate: 63400009 (nZCv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
> [   36.719584] pc : mkdir_mondata_subdir+0x214/0x228
> [   36.719586] lr : mkdir_mondata_subdir+0x90/0x228
> [   36.719588] sp : ffff800082d73b00
> [   36.719589] x29: ffff800082d73b10 x28: ffff00011dc1bc00 x27: 0000000000000000
> [   36.719591] x26: ffffa9a19cbe5388 x25: ffffa9a19cba5880 x24: ffff000103364008
> [   36.719593] x23: ffffa9a19cbe5320 x22: ffff00010675c280 x21: ffff00010675c480
> [   36.719594] x20: 000000000675c480 x19: ffff00010675c280 x18: 0000000000000030
> [   36.719596] x17: 0000000000000000 x16: 0000000000000000 x15: ffffffffffffffff
> [   36.719598] x14: ffff800102d73bb7 x13: 0000000000000002 x12: ffff800082d73bc0
> [   36.719599] x11: 0000000000000001 x10: 000000000000005f x9 : 0000000000000001
> [   36.719601] x8 : 0101010101010101 x7 : ffff7f7f7fff7f7f x6 : fefdff0005c7ff2f
> [   36.719603] x5 : 0000800000008000 x4 : 0000000000000006 x3 : 0000000000000000
> [   36.719604] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffffa9a19cbe5388
> [   36.719606] Call trace:
> [   36.719607]  mkdir_mondata_subdir+0x214/0x228
> [   36.719609]  mkdir_mondata_all+0xb0/0x110
> [   36.719612]  rdt_get_tree+0x218/0x500
> [   36.719614]  vfs_get_tree+0x28/0xec
> [   36.719616]  path_mount+0x3d4/0xa4c
> [   36.719618]  __arm64_sys_mount+0x1d4/0x2b0
> [   36.719619]  invoke_syscall+0x48/0x114
> [   36.719622]  el0_svc_common.constprop.0+0x44/0xe4
> [   36.719625]  do_el0_svc+0x38/0xa4
> [   36.719627]  el0_svc+0x2c/0x84
> [   36.719630]  el0t_64_sync_handler+0xc0/0xc4
> [   36.719632]  el0t_64_sync+0x190/0x194
> [   36.719633] ---[ end trace 0000000000000000 ]---
>
> It is due to the fact the event list for MBA resource is empty [1] (event list is created only for L3 resource), and we hit here:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/tree/fs/resctrl/rdtgroup.c?h=mpam/snapshot/v6.5-rc1#n3109
>
> With the following change, managed to mount the resource control:
>
> # git diff fs/resctrl/rdtgroup.c
> diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
> index 43efed317f1b..655757183b84 100644
> --- a/fs/resctrl/rdtgroup.c
> +++ b/fs/resctrl/rdtgroup.c
> @@ -3096,6 +3096,9 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
>         char name[32];
>         int ret;
>
> +       if (!strcmp(r->name, "MB"))
> +               return 0;
> +

Can you explain why it's a problem for the memory bandwidth allocation
(MBA) resource to not have any monitoring events? All of the
monitoring events are on the L3 resource on the existing x86
implementations and this doesn't lead to any issues.

You explained earlier that two different MSCs have monitoring
capabilities on this hardware and this crash report is suggesting that
some monitoring events should have been assigned to the MBA resource.

The common resctrl code, especially the implementation of the RMID
limbo list assumes a single monitoring resource in the system. That
is, a single resource is expected to be able to tell you the memory
bandwidth AND LLC occupancy of an RMID. That shouldn't be hard to fix,
but if the LLC occupancy and MBM domains didn't align, fixing the
limbo mechanism would become much more difficult.

Can you start by explaining why monitoring resources should be
assigned to different resources? What would have gone wrong if the DDR
MSCs' bandwidth event counts were attached to the L3 resource? At a
high level, the concept of a "resource" in resctrl seems abstract
enough that it's difficult for me to understand why the MPAM code
would choose to arrange things this way.

Thanks!
-Peter
Amit Singh Tomar Sept. 22, 2023, 7:08 p.m. UTC | #2
Hi Peter,

-----Original Message-----
From: Peter Newman <peternewman@google.com> 
Sent: Tuesday, September 12, 2023 2:26 PM
To: Amit Singh Tomar <amitsinght@marvell.com>
Cc: james.morse@arm.com; Luck, Tony <tony.luck@intel.com>; Reinette Chatre <reinette.chatre@intel.com>; jonathan.cameron@huawei.com; carl@os.amperecomputing.com; Yu, Fenghua <fenghua.yu@intel.com>; George Cherian <gcherian@marvell.com>; linux-arm-kernel@lists.infradead.org; rohit.mathew@arm.com
Subject: [EXT] Re: Testing DDR MPAM monitor (mbm_total_bytes)

External Email

----------------------------------------------------------------------
Hi Amit,

On Sun, Sep 10, 2023 at 8:58 PM Amit Singh Tomar <amitsinght@marvell.com> wrote:
>
> Hi James,
>
> We have two types of MSC's:
>
> 1 ) L3 MSC with controls/features:
>     "mpam_feat_cpor_part", and "mpam_feat_msmon_csu",
>
> 2) DDR MSC with controls/features:
>     "mpam_feat_mbw_min", "mpam_feat_mbw_max", "mpam_feat_msmon_mbwu", "mpam_feat_msmon_mbwu_63counter", and "mpam_feat_msmon_mbwu_rwbw".
>
> Trying to test DDR MPAM monitors (under DDR MSC mapped to mpam_feat_msmon_mbwu feature) using your latest snapshot[1] (mpam/snapshot/v6.5-rc1) but found few issues.
>
> 1) When try to mount the resource control, seeing the following stack trace:
>
> root@localhost:~# mount -t resctrl resctrl -o this_is_not_abi 
> /sys/fs/resctrl
> mount: /sys/fs/resctrl: permission denied.
> root@localhost:~# dmesg | tail -33
> [   36.719569] ------------[ cut here ]------------
> [   36.719571] WARNING: CPU: 23 PID: 786 at fs/resctrl/rdtgroup.c:3109 mkdir_mondata_subdir+0x214/0x228
> [   36.719579] Modules linked in:
> [   36.719580] CPU: 23 PID: 786 Comm: mount Not tainted 6.5.0-rc1-g9f0a8101361c-dirty #2
> [   36.719582] Hardware name: Marvell SP1W5NXX board (DT)
> [   36.719582] pstate: 63400009 (nZCv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
> [   36.719584] pc : mkdir_mondata_subdir+0x214/0x228
> [   36.719586] lr : mkdir_mondata_subdir+0x90/0x228
> [   36.719588] sp : ffff800082d73b00
> [   36.719589] x29: ffff800082d73b10 x28: ffff00011dc1bc00 x27: 0000000000000000
> [   36.719591] x26: ffffa9a19cbe5388 x25: ffffa9a19cba5880 x24: ffff000103364008
> [   36.719593] x23: ffffa9a19cbe5320 x22: ffff00010675c280 x21: ffff00010675c480
> [   36.719594] x20: 000000000675c480 x19: ffff00010675c280 x18: 0000000000000030
> [   36.719596] x17: 0000000000000000 x16: 0000000000000000 x15: ffffffffffffffff
> [   36.719598] x14: ffff800102d73bb7 x13: 0000000000000002 x12: ffff800082d73bc0
> [   36.719599] x11: 0000000000000001 x10: 000000000000005f x9 : 0000000000000001
> [   36.719601] x8 : 0101010101010101 x7 : ffff7f7f7fff7f7f x6 : fefdff0005c7ff2f
> [   36.719603] x5 : 0000800000008000 x4 : 0000000000000006 x3 : 0000000000000000
> [   36.719604] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffffa9a19cbe5388
> [   36.719606] Call trace:
> [   36.719607]  mkdir_mondata_subdir+0x214/0x228
> [   36.719609]  mkdir_mondata_all+0xb0/0x110
> [   36.719612]  rdt_get_tree+0x218/0x500
> [   36.719614]  vfs_get_tree+0x28/0xec
> [   36.719616]  path_mount+0x3d4/0xa4c
> [   36.719618]  __arm64_sys_mount+0x1d4/0x2b0
> [   36.719619]  invoke_syscall+0x48/0x114
> [   36.719622]  el0_svc_common.constprop.0+0x44/0xe4
> [   36.719625]  do_el0_svc+0x38/0xa4
> [   36.719627]  el0_svc+0x2c/0x84
> [   36.719630]  el0t_64_sync_handler+0xc0/0xc4
> [   36.719632]  el0t_64_sync+0x190/0x194
> [   36.719633] ---[ end trace 0000000000000000 ]---
>
> It is due to the fact the event list for MBA resource is empty [1] (event list is created only for L3 resource), and we hit here:
>
> https://urldefense.proofpoint.com/v2/url?u=https-3A__git.kernel.org_pu
> b_scm_linux_kernel_git_morse_linux.git_tree_fs_resctrl_rdtgroup.c-3Fh-
> 3Dmpam_snapshot_v6.5-2Drc1-23n3109&d=DwIFaQ&c=nKjWec2b6R0mOyPaz7xtfQ&r
> =V_GK7jRuCHDErm6txmgDK1-MbUihtnSQ3gPgB-A-JKU&m=yoN0wNJCa54iyGEQiAtOQOE
> 8ehjd_eAfNVoqvPU8yuNe6ZcHmI61c4AHwj8oH9aD&s=O9tOocqCCamJFA2TLxWBei1pIu
> E8d8f7r66Dz93IKNc&e=
>
> With the following change, managed to mount the resource control:
>
> # git diff fs/resctrl/rdtgroup.c
> diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 
> 43efed317f1b..655757183b84 100644
> --- a/fs/resctrl/rdtgroup.c
> +++ b/fs/resctrl/rdtgroup.c
> @@ -3096,6 +3096,9 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
>         char name[32];
>         int ret;
>
> +       if (!strcmp(r->name, "MB"))
> +               return 0;
> +

Can you explain why it's a problem for the memory bandwidth allocation
(MBA) resource to not have any monitoring events? All of the monitoring events are on the L3 resource on the existing x86 implementations and this doesn't lead to any issues.
[>>] 
On x86, these monitoring capabilities (be it the total memory bandwidth, or LLC occupancy) are enumerated at CPU level (as CPU features), and set only for L3 resource. 
https://elixir.bootlin.com/linux/latest/source/arch/x86/kernel/cpu/resctrl/monitor.c#L828
and since only L3 resource has the monitor capability, following code path is never called for MBA resource, and there is no such issue on x86. 
https://elixir.bootlin.com/linux/latest/source/arch/x86/kernel/cpu/resctrl/rdtgroup.c#L2868
https://elixir.bootlin.com/linux/latest/source/arch/x86/kernel/cpu/resctrl/rdtgroup.c#L2970

You explained earlier that two different MSCs have monitoring capabilities on this hardware and this crash report is suggesting that some monitoring events should have been assigned to the MBA resource.
[>>] Yes, and in absence of event list (for MBA resource), this crash is triggered.

The common resctrl code, especially the implementation of the RMID limbo list assumes a single monitoring resource in the system. That is, a single resource is expected to be able to tell you the memory bandwidth AND LLC occupancy of an RMID. That shouldn't be hard to fix, but if the LLC occupancy and MBM domains didn't align, fixing the limbo mechanism would become much more difficult.

Can you start by explaining why monitoring resources should be assigned to different resources? What would have gone wrong if the DDR MSCs' bandwidth event counts were attached to the L3 resource? At a high level, the concept of a "resource" in resctrl seems abstract enough that it's difficult for me to understand why the MPAM code would choose to arrange things this way.
[>>] 
But on ARM side, monitoring  capabilities are enumerated at individual MSC level, for instance, Cache storage usage monitor (LLC occupancy counter) is enumerated 
under L3 MSC, and DDR MPAM monitor (Memory BW) is enumerated under DDR MSC (corresponds to MBA resource).

Monitor capability is set for both L3[1], and MBA[2] resource (from ARM specific resource control code).

We may get away with this (avoid that crash, I mean) by doing something similarly to what X86 resource control does. But, as mentioned earlier, if DDR BW monitor is coupled with L3 resource, there would be feature mismatch as L3 resource class only has knowledge of the feature/control enumerated under L3 MSC. Unlike x86, there is no shared resource monitoring .i.e. L3 occupancy, and Memory BW on ARM side (at-least the way ARM specific resource control code is organized at the moment).

Also, "I'd like to highlight another point, as per ARM MPAM spec, all these controls/monitors are optional, and if an implementation decide not to have L3 MSC, which effectively means there is no RDT_RESOURCE_L3 gets created (basically there is no CACHE class[3]). Then, how this whole thing would work? where it says

     "MBM events are also part of RDT_RESOURCE_L3 resource because as per the SDM the total and local memory bandwidth
      are enumerated as part of L3 monitoring".

Thanks,
-Amit

[1]: https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/tree/drivers/platform/mpam/mpam_resctrl.c?h=mpam/snapshot/v6.5-rc1#n834
[2]: https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/tree/drivers/platform/mpam/mpam_resctrl.c?h=mpam/snapshot/v6.5-rc1#n869
[3]: https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/tree/include/linux/arm_mpam.h?h=mpam/snapshot/v6.5-rc1#n28
Peter Newman Sept. 26, 2023, 1:28 p.m. UTC | #3
Hi Amit,

On Fri, Sep 22, 2023 at 9:08 PM Amit Singh Tomar <amitsinght@marvell.com> wrote:
> -----Original Message-----
> From: Peter Newman <peternewman@google.com>
> Sent: Tuesday, September 12, 2023 2:26 PM
> To: Amit Singh Tomar <amitsinght@marvell.com>
> Cc: james.morse@arm.com; Luck, Tony <tony.luck@intel.com>; Reinette Chatre <reinette.chatre@intel.com>; jonathan.cameron@huawei.com; carl@os.amperecomputing.com; Yu, Fenghua <fenghua.yu@intel.com>; George Cherian <gcherian@marvell.com>; linux-arm-kernel@lists.infradead.org; rohit.mathew@arm.com
 You explained earlier that two different MSCs have monitoring
capabilities on this hardware and this crash report is suggesting that
some monitoring events should have been assigned to the MBA resource.
> [>>] Yes, and in absence of event list (for MBA resource), this crash is triggered.
>
> The common resctrl code, especially the implementation of the RMID limbo list assumes a single monitoring resource in the system. That is, a single resource is expected to be able to tell you the memory bandwidth AND LLC occupancy of an RMID. That shouldn't be hard to fix, but if the LLC occupancy and MBM domains didn't align, fixing the limbo mechanism would become much more difficult.
>
> Can you start by explaining why monitoring resources should be assigned to different resources? What would have gone wrong if the DDR MSCs' bandwidth event counts were attached to the L3 resource? At a high level, the concept of a "resource" in resctrl seems abstract enough that it's difficult for me to understand why the MPAM code would choose to arrange things this way.
> [>>]
> But on ARM side, monitoring  capabilities are enumerated at individual MSC level, for instance, Cache storage usage monitor (LLC occupancy counter) is enumerated
> under L3 MSC, and DDR MPAM monitor (Memory BW) is enumerated under DDR MSC (corresponds to MBA resource).
>
> Monitor capability is set for both L3[1], and MBA[2] resource (from ARM specific resource control code).
>
> We may get away with this (avoid that crash, I mean) by doing something similarly to what X86 resource control does. But, as mentioned earlier, if DDR BW monitor is coupled with L3 resource, there would be feature mismatch as L3 resource class only has knowledge of the feature/control enumerated under L3 MSC. Unlike x86, there is no shared resource monitoring .i.e. L3 occupancy, and Memory BW on ARM side (at-least the way ARM specific resource control code is organized at the moment).
>
> Also, "I'd like to highlight another point, as per ARM MPAM spec, all these controls/monitors are optional, and if an implementation decide not to have L3 MSC, which effectively means there is no RDT_RESOURCE_L3 gets created (basically there is no CACHE class[3]). Then, how this whole thing would work? where it says
>
>      "MBM events are also part of RDT_RESOURCE_L3 resource because as per the SDM the total and local memory bandwidth
>       are enumerated as part of L3 monitoring".
>

It looks like associating the DDR MSCs' bandwidth counters with the
MBA resource is a choice made by James's prototype[1]. I'm sure we'll
be able to discuss it more when it becomes a topic of review.

It's my opinion that a resctrl driver could choose to associate
bandwidth counters residing on a memory controller with the L3
resource as long as the memory controller and L3 caches domains' are
the same. That seems to be the only implication of "level" and "scope"
in the resctrl fs code.

-Peter

[1] https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/commit/drivers/platform/mpam/mpam_resctrl.c?h=mpam/snapshot/v6.5-rc1&id=492bbaf1e09f9fcfe2ac37ddb3e6678014044453
diff mbox series

Patch

diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
index 43efed317f1b..655757183b84 100644
--- a/fs/resctrl/rdtgroup.c
+++ b/fs/resctrl/rdtgroup.c
@@ -3096,6 +3096,9 @@  static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
        char name[32];
        int ret;
 
+       if (!strcmp(r->name, "MB"))
+               return 0;
+
        sprintf(name, "mon_%s_%02d", r->name, d->id);
        /* create the directory */
        kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);


and seeing mbm_total_bytes populated under mon_data.

root@localhost:/sys/fs/resctrl# ls mon_data/mon_L3_00/
llc_occupancy  mbm_total_bytes

2). Other issue I see is, when we try to read BW monitors using perf tool

root@localhost:/sys/fs/resctrl# perf stat -e resctrl_pmu/mbm_total_bytes,resctrl_id=0x7eb0c51a2bebc62b/ bw_mem -P 80 1G bzero;
[   72.926508] Unable to handle kernel paging request at virtual address ffffffff83cac910
[   72.934432] Mem abort info:
[   72.937217]   ESR = 0x0000000096000004
[   72.940961]   EC = 0x25: DABT (current EL), IL = 32 bits
[   72.946266]   SET = 0, FnV = 0
[   72.949311]   EA = 0, S1PTW = 0
[   72.952443]   FSC = 0x04: level 0 translation fault
[   72.957313] Data abort info:
[   72.960186]   ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000
[   72.965663]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[   72.970707]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[   72.976012] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000021c7f000
[   72.982706] [ffffffff83cac910] pgd=0000000000000000, p4d=0000000000000000
[   72.989489] Internal error: Oops: 0000000096000004 [#2] PREEMPT SMP
[   72.995748] Modules linked in:
[   72.998794] CPU: 72 PID: 800 Comm: perf Tainted: G      D            6.5.0-rc1-g9f0a8101361c-dirty #4
[   73.008007] Hardware name: Marvell SP1W5NXX board (DT)
[   73.013137] pstate: 83400009 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
[   73.020090] pc : resctrl_arch_mon_ctx_free+0x10/0xd8
[   73.025049] lr : resctrl_pmu_event_destroy+0x3c/0x4c
[   73.030005] sp : ffff800082e6bbb0
[   73.033310] x29: ffff800082e6bbb0 x28: ffffcc4d83949000 x27: 0000000000000000
[   73.040439] x26: ffff00011d948420 x25: ffff00011d948420 x24: ffffcc4d83d8c4c8
[   73.047569] x23: 0000000000000000 x22: 0000000000000000 x21: 00000000ffffffea
[   73.054697] x20: 0000000000000002 x19: ffff00011d948420 x18: 0000000000000000
[   73.061827] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
[   73.068955] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000040
[   73.076084] x11: ffff000100400b68 x10: ffff000100400b6a x9 : ffffcc4d83d8c4d0
[   73.083212] x8 : ffff000100400b90 x7 : 0000000000000000 x6 : ffff000100400bc8
[   73.090341] x5 : ffff00011d36bc00 x4 : ffff800082e6bb50 x3 : 0000000000000000
[   73.097469] x2 : ffffffff83cac910 x1 : 0000000000000002 x0 : ffffcc4d83de5200
[   73.104598] Call trace:
[   73.107037]  resctrl_arch_mon_ctx_free+0x10/0xd8
[   73.111646]  resctrl_pmu_event_destroy+0x3c/0x4c
[   73.116254]  perf_try_init_event+0x11c/0x138
[   73.120517]  perf_event_alloc+0x354/0xea4
[   73.124518]  __do_sys_perf_event_open+0x174/0xab0
[   73.129214]  __arm64_sys_perf_event_open+0x28/0x34
[   73.133996]  invoke_syscall+0x48/0x114
[   73.137737]  el0_svc_common.constprop.0+0x44/0xe4
[   73.142433]  do_el0_svc+0x38/0xa4
[   73.145741]  el0_svc+0x2c/0x84
[   73.148787]  el0t_64_sync_handler+0xc0/0xc4
[   73.152963]  el0t_64_sync+0x190/0x194
[   73.156617] Code: d503233f a9bd7bfd 910003fd f90013f5 (b9400055) 
[   73.162702] ---[ end trace 0000000000000000 ]---
Segmentation fault
root@localhost:/sys/fs/resctrl# 

This is due to pointer assignment issues, can be fixed with following changes:

diff --git a/drivers/perf/resctrl_pmu.c b/drivers/perf/resctrl_pmu.c
index 99a2b90b5d83..47e41e0f6a03 100644
--- a/drivers/perf/resctrl_pmu.c
+++ b/drivers/perf/resctrl_pmu.c
diff --git a/drivers/perf/resctrl_pmu.c b/drivers/perf/resctrl_pmu.c
index 99a2b90b5d83..47e41e0f6a03 100644
--- a/drivers/perf/resctrl_pmu.c
+++ b/drivers/perf/resctrl_pmu.c
@@ -74,7 +74,7 @@  static void resctrl_pmu_event_destroy(struct perf_event *event)
        if (!r)
                return;
 
-       resctrl_arch_mon_ctx_free(r, event_num, hwc->idx);
+       resctrl_arch_mon_ctx_free(r, event_num, &hwc->idx);
 }
 
 static int resctrl_pmu_event_init(struct perf_event *event)
@@ -144,7 +144,7 @@  static int resctrl_pmu_event_init(struct perf_event *event)
                        return -EINVAL;
        }
 
-       hwc->idx = resctrl_arch_mon_ctx_alloc_no_wait(r, event_num);
+       hwc->idx = *(int *)resctrl_arch_mon_ctx_alloc_no_wait(r, event_num);
        if (hwc->idx == -ENOSPC)
                return -ENOSPC;
        event->destroy = resctrl_pmu_event_destroy;
@@ -183,7 +183,7 @@  static void resctrl_pmu_event_update(struct perf_event *event)
                prev = local64_read(&hwc->prev_count);
 
                err = resctrl_arch_rmid_read(r, d, closid, rmid,
-                                            event_num, &now, hwc->idx);
+                                            event_num, &now, &hwc->idx);
                if (err)
                        return;


3) Finally, when we try to read the DDR BW monitors, we never hit the right code path related to DDR monitor read, and it is due to the fact that DDR BW monitor is coupled with L3 domain/resource.
    Only these two features "mpam_feat_cpor_part" and "mpam_feat_msmon_csu" are enumerated for L3 resource class. 
   
    For instance:

    root@localhost:~# cd /sys/fs/resctrl/
    root@localhost:/sys/fs/resctrl# mkdir p1
    [   61.607361] rdtgroup: XYZZY: Allocated PARTID 1 for ctrl-group 'p1'
    root@localhost:/sys/fs/resctrl# cd p1/
    root@localhost:/sys/fs/resctrl/p1# cat id
    0x5cd4f5215e8afd45
    root@localhost:/sys/fs/resctrl/p1# perf stat -e resctrl_pmu/mbm_total_bytes,resctrl_id=0x5cd4f5215e8afd45/ bw_mem -P 80 1G bzero;
    1000.00 360557.50

    Performance counter stats for 'system wide':

                 0      resctrl_pmu/mbm_total_bytes,resctrl_id=0x5cd4f5215e8afd45/                                   

      37.724019146 seconds time elapsed

and read (from Kernel buffer) following in parallel

diff --git a/drivers/platform/mpam/mpam_resctrl.c b/drivers/platform/mpam/mpam_resctrl.c
index 6e9925fcf934..8903fbdf0242 100644
--- a/drivers/platform/mpam/mpam_resctrl.c
+++ b/drivers/platform/mpam/mpam_resctrl.c
@@ -413,6 +420,10 @@  int resctrl_arch_rmid_read(struct rdt_resource     *r, struct rdt_domain *d,
                return -EINVAL;
        }
 
+       pr_info("resource name is %s\n", r->name);
+       pr_info("event id is %x\n", eventid);
+       pr_info("type set is %x\n", type);
+

diff --git a/drivers/platform/mpam/mpam_devices.c b/drivers/platform/mpam/mpam_devices.c
index aece6cab6506..7f78237a088c 100644
--- a/drivers/platform/mpam/mpam_devices.c
+++ b/drivers/platform/mpam/mpam_devices.c
@@ -1067,8 +1072,13 @@  int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx,
        if (!mpam_is_enabled())
                return -EIO;
 
-       if (!mpam_has_feature(type, cprops))
+       pr_info("type is %x\n", type);
+       pr_info("class property %x\n", cprops->features);
+
+       if (!mpam_has_feature(type, cprops)) {
+               pr_info("feature mis-match\n");
                return -EOPNOTSUPP;
+       }    

[  +0.664539] rdtgroup: XYZZY: Allocated PARTID 1 for ctrl-group 'p1'
[  +0.359458] mpam: resctrl: resource name is L3
[  +0.000001] mpam: resctrl: event id is 2
[  +0.000001] mpam: resctrl: type set is d
[  +0.000000] mpam: type is d
[  +0.000001] mpam: class property 802 (mpam_feat_cpor_part and mpam_feat_msmon_csu which is L3 resources) 
[  +0.000001] mpam: feature mis-match

4) With the following hacked changes, right code path[2] is hit, and can see prints from[2]

   diff --git a/drivers/perf/resctrl_pmu.c b/drivers/perf/resctrl_pmu.c
index 99a2b90b5d83..d30bcf0a00df 100644
--- a/drivers/perf/resctrl_pmu.c
+++ b/drivers/perf/resctrl_pmu.c
@@ -57,6 +57,7 @@  static struct rdt_resource *resctrl_event_get_resource(u16 event_num)
        switch (event_id) {
        case QOS_L3_OCCUP_EVENT_ID:
        case QOS_L3_MBM_TOTAL_EVENT_ID:
+               return resctrl_arch_get_resource(RDT_RESOURCE_MBA);
        case QOS_L3_MBM_LOCAL_EVENT_ID:
                return resctrl_arch_get_resource(RDT_RESOURCE_L3);
        }