Message ID | 20181120185814.13362-3-Kenny.Ho@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | DRM cgroup controller | expand |
Am 20.11.18 um 19:58 schrieb Kenny Ho: > Since many parts of the DRM subsystem has vendor-specific > implementations, we introduce mechanisms for vendor to register their > specific resources and control files to the DRM cgroup subsystem. A > vendor will register itself with the DRM cgroup subsystem first before > registering individual DRM devices to the cgroup subsystem. > > In addition to the cgroup_subsys_state that is common to all DRM > devices, a device-specific state is introduced and it is allocated > according to the vendor of the device. Mhm, it's most likely just a naming issue but I think we should drop the term "vendor" here and rather use "driver" instead. Background is that both Intel as well as AMD have multiple drivers for different hardware generations and we certainly don't want to handle all drivers from one vendor the same way. Christian. > > Change-Id: I908ee6975ea0585e4c30eafde4599f87094d8c65 > Signed-off-by: Kenny Ho <Kenny.Ho@amd.com> > --- > include/drm/drm_cgroup.h | 39 ++++++++++++++++ > include/drm/drmcgrp_vendors.h | 7 +++ > include/linux/cgroup_drm.h | 26 +++++++++++ > kernel/cgroup/drm.c | 84 +++++++++++++++++++++++++++++++++++ > 4 files changed, 156 insertions(+) > create mode 100644 include/drm/drm_cgroup.h > create mode 100644 include/drm/drmcgrp_vendors.h > > diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h > new file mode 100644 > index 000000000000..26cbea7059a6 > --- /dev/null > +++ b/include/drm/drm_cgroup.h > @@ -0,0 +1,39 @@ > +/* SPDX-License-Identifier: MIT > + * Copyright 2018 Advanced Micro Devices, Inc. 
> + */ > +#ifndef __DRM_CGROUP_H__ > +#define __DRM_CGROUP_H__ > + > +#define DRMCGRP_VENDOR(_x) _x ## _drmcgrp_vendor_id, > +enum drmcgrp_vendor_id { > +#include <drm/drmcgrp_vendors.h> > + DRMCGRP_VENDOR_COUNT, > +}; > +#undef DRMCGRP_VENDOR > + > +#define DRMCGRP_VENDOR(_x) extern struct drmcgrp_vendor _x ## _drmcgrp_vendor; > +#include <drm/drmcgrp_vendors.h> > +#undef DRMCGRP_VENDOR > + > + > + > +#ifdef CONFIG_CGROUP_DRM > + > +extern struct drmcgrp_vendor *drmcgrp_vendors[]; > + > +int drmcgrp_register_vendor(struct drmcgrp_vendor *vendor, enum drmcgrp_vendor_id id); > +int drmcgrp_register_device(struct drm_device *device, enum drmcgrp_vendor_id id); > + > +#else > +static int drmcgrp_register_vendor(struct drmcgrp_vendor *vendor, enum drmcgrp_vendor_id id) > +{ > + return 0; > +} > + > +static int drmcgrp_register_device(struct drm_device *device, enum drmcgrp_vendor_id id) > +{ > + return 0; > +} > + > +#endif /* CONFIG_CGROUP_DRM */ > +#endif /* __DRM_CGROUP_H__ */ > diff --git a/include/drm/drmcgrp_vendors.h b/include/drm/drmcgrp_vendors.h > new file mode 100644 > index 000000000000..b04d8649851b > --- /dev/null > +++ b/include/drm/drmcgrp_vendors.h > @@ -0,0 +1,7 @@ > +/* SPDX-License-Identifier: MIT > + * Copyright 2018 Advanced Micro Devices, Inc. 
> + */ > +#if IS_ENABLED(CONFIG_CGROUP_DRM) > + > + > +#endif > diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h > index 79ab38b0f46d..a776662d9593 100644 > --- a/include/linux/cgroup_drm.h > +++ b/include/linux/cgroup_drm.h > @@ -6,10 +6,36 @@ > > #ifdef CONFIG_CGROUP_DRM > > +#include <linux/mutex.h> > #include <linux/cgroup.h> > +#include <drm/drm_file.h> > +#include <drm/drm_cgroup.h> > + > +/* limit defined per the way drm_minor_alloc operates */ > +#define MAX_DRM_DEV (64 * DRM_MINOR_RENDER) > + > +struct drmcgrp_device { > + enum drmcgrp_vendor_id vid; > + struct drm_device *dev; > + struct mutex mutex; > +}; > + > +/* vendor-common resource counting goes here */ > +/* this struct should be included in the vendor specific resource */ > +struct drmcgrp_device_resource { > + struct drmcgrp_device *ddev; > +}; > + > +struct drmcgrp_vendor { > + struct cftype *(*get_cftypes)(void); > + struct drmcgrp_device_resource *(*alloc_dev_resource)(void); > + void (*free_dev_resource)(struct drmcgrp_device_resource *dev_resource); > +}; > + > > struct drmcgrp { > struct cgroup_subsys_state css; > + struct drmcgrp_device_resource *dev_resources[MAX_DRM_DEV]; > }; > > static inline struct drmcgrp *css_drmcgrp(struct cgroup_subsys_state *css) > diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c > index d9e194b9aead..f9630cc389bc 100644 > --- a/kernel/cgroup/drm.c > +++ b/kernel/cgroup/drm.c > @@ -1,8 +1,30 @@ > // SPDX-License-Identifier: MIT > // Copyright 2018 Advanced Micro Devices, Inc. 
> +#include <linux/export.h> > #include <linux/slab.h> > #include <linux/cgroup.h> > +#include <linux/fs.h> > +#include <linux/seq_file.h> > +#include <linux/mutex.h> > #include <linux/cgroup_drm.h> > +#include <drm/drm_device.h> > +#include <drm/drm_cgroup.h> > + > +/* generate an array of drm cgroup vendor pointers */ > +#define DRMCGRP_VENDOR(_x)[_x ## _drmcgrp_vendor_id] = NULL, > +struct drmcgrp_vendor *drmcgrp_vendors[] = { > +#include <drm/drmcgrp_vendors.h> > +}; > +#undef DRMCGRP_VENDOR > +EXPORT_SYMBOL(drmcgrp_vendors); > + > +static DEFINE_MUTEX(drmcgrp_mutex); > + > +/* indexed by drm_minor for access speed */ > +static struct drmcgrp_device *known_drmcgrp_devs[MAX_DRM_DEV]; > + > +static int max_minor; > + > > static u64 drmcgrp_test_read(struct cgroup_subsys_state *css, > struct cftype *cft) > @@ -13,6 +35,12 @@ static u64 drmcgrp_test_read(struct cgroup_subsys_state *css, > static void drmcgrp_css_free(struct cgroup_subsys_state *css) > { > struct drmcgrp *drmcgrp = css_drmcgrp(css); > + int i; > + > + for (i = 0; i <= max_minor; i++) { > + if (drmcgrp->dev_resources[i] != NULL) > + drmcgrp_vendors[known_drmcgrp_devs[i]->vid]->free_dev_resource(drmcgrp->dev_resources[i]); > + } > > kfree(css_drmcgrp(css)); > } > @@ -21,11 +49,27 @@ static struct cgroup_subsys_state * > drmcgrp_css_alloc(struct cgroup_subsys_state *parent_css) > { > struct drmcgrp *drmcgrp; > + int i; > > drmcgrp = kzalloc(sizeof(struct drmcgrp), GFP_KERNEL); > if (!drmcgrp) > return ERR_PTR(-ENOMEM); > > + for (i = 0; i <= max_minor; i++) { > + if (known_drmcgrp_devs[i] != NULL) { > + struct drmcgrp_device_resource *ddr = > + drmcgrp_vendors[known_drmcgrp_devs[i]->vid]->alloc_dev_resource(); > + > + if (IS_ERR(ddr)) { > + drmcgrp_css_free(&drmcgrp->css); > + return ERR_PTR(-ENOMEM); > + } > + > + drmcgrp->dev_resources[i] = ddr; > + drmcgrp->dev_resources[i]->ddev = known_drmcgrp_devs[i]; > + } > + } > + > return &drmcgrp->css; > } > > @@ -44,3 +88,43 @@ struct cgroup_subsys 
drm_cgrp_subsys = { > .legacy_cftypes = files, > .dfl_cftypes = files, > }; > + > +int drmcgrp_register_vendor(struct drmcgrp_vendor *vendor, enum drmcgrp_vendor_id id) > +{ > + int rc = 0; > + struct cftype *cfts; > + > + // TODO: root css created before any registration > + if (drmcgrp_vendors[id] == NULL) { > + drmcgrp_vendors[id] = vendor; > + cfts = vendor->get_cftypes(); > + if (cfts != NULL) > + rc = cgroup_add_legacy_cftypes(&drm_cgrp_subsys, cfts); > + } > + return rc; > +} > +EXPORT_SYMBOL(drmcgrp_register_vendor); > + > + > +int drmcgrp_register_device(struct drm_device *dev, enum drmcgrp_vendor_id id) > +{ > + struct drmcgrp_device *ddev; > + > + ddev = kzalloc(sizeof(struct drmcgrp_device), GFP_KERNEL); > + if (!ddev) > + return -ENOMEM; > + > + mutex_lock(&drmcgrp_mutex); > + > + ddev->vid = id; > + ddev->dev = dev; > + mutex_init(&ddev->mutex); > + > + known_drmcgrp_devs[dev->primary->index] = ddev; > + > + max_minor = max(max_minor, dev->primary->index); > + > + mutex_unlock(&drmcgrp_mutex); > + return 0; > +} > +EXPORT_SYMBOL(drmcgrp_register_device);
Thanks Tejun,Eric and Christian for your replies. We want GPUs resource management to work seamlessly with containers and container orchestration. With the Intel / bpf based approach this is not possible. From your response we gather the following. GPU resources need to be abstracted. We will send a new proposal in same vein. Our current thinking is to start with a single abstracted resource and build a framework that can be expanded to include additional resources. We plan to start with “GPU cores”. We believe all GPUs have some concept of cores or compute unit. Your feedback is highly appreciated. Best Regards, Harish From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Tejun Heo <tj@kernel.org> Sent: Tuesday, November 20, 2018 5:30 PM To: Ho, Kenny Cc: cgroups@vger.kernel.org; intel-gfx@lists.freedesktop.org; y2kenny@gmail.com; amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org Subject: Re: [PATCH RFC 2/5] cgroup: Add mechanism to register vendor specific DRM devices Hello, On Tue, Nov 20, 2018 at 10:21:14PM +0000, Ho, Kenny wrote: > By this reply, are you suggesting that vendor specific resources > will never be acceptable to be managed under cgroup? Let say a user I wouldn't say never but whatever which gets included as a cgroup controller should have clearly defined resource abstractions and the control schemes around them including support for delegation. AFAICS, gpu side still seems to have a long way to go (and it's not clear whether that's somewhere it will or needs to end up). > want to have similar functionality as what cgroup is offering but to > manage vendor specific resources, what would you suggest as a > solution? When you say keeping vendor specific resource regulation > inside drm or specific drivers, do you mean we should replicate the > cgroup infrastructure there or do you mean either drm or specific > driver should query existing hierarchy (such as device or perhaps > cpu) for the process organization information? 
> > To put the questions in more concrete terms, let say a user wants to > expose certain part of a gpu to a particular cgroup similar to the > way selective cpu cores are exposed to a cgroup via cpuset, how > should we go about enabling such functionality? Do what the intel driver or bpf is doing? It's not difficult to hook into cgroup for identification purposes. Thanks.
Hi Harish, Am 26.11.18 um 21:59 schrieb Kasiviswanathan, Harish: > Thanks Tejun,Eric and Christian for your replies. > > We want GPUs resource management to work seamlessly with containers and container orchestration. With the Intel / bpf based approach this is not possible. I think one lesson learned is that we should describe this goal in the patch cover letter when sending it out. That could have avoided something like half of the initial confusion. > From your response we gather the following. GPU resources need to be abstracted. We will send a new proposal in same vein. Our current thinking is to start with a single abstracted resource and build a framework that can be expanded to include additional resources. We plan to start with “GPU cores”. We believe all GPUs have some concept of cores or compute unit. Sounds good, just one comment on creating a framework: Before doing something like this think for a moment if it doesn't make sense to rather extend the existing cgroup framework. That approach usually makes more sense because you rarely need something fundamentally new. Regards, Christian. > > Your feedback is highly appreciated. > > Best Regards, > Harish > > > > From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Tejun Heo <tj@kernel.org> > Sent: Tuesday, November 20, 2018 5:30 PM > To: Ho, Kenny > Cc: cgroups@vger.kernel.org; intel-gfx@lists.freedesktop.org; y2kenny@gmail.com; amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org > Subject: Re: [PATCH RFC 2/5] cgroup: Add mechanism to register vendor specific DRM devices > > > Hello, > > On Tue, Nov 20, 2018 at 10:21:14PM +0000, Ho, Kenny wrote: >> By this reply, are you suggesting that vendor specific resources >> will never be acceptable to be managed under cgroup? Let say a user > I wouldn't say never but whatever which gets included as a cgroup > controller should have clearly defined resource abstractions and the > control schemes around them including support for delegation. 
AFAICS, > gpu side still seems to have a long way to go (and it's not clear > whether that's somewhere it will or needs to end up). > >> want to have similar functionality as what cgroup is offering but to >> manage vendor specific resources, what would you suggest as a >> solution? When you say keeping vendor specific resource regulation >> inside drm or specific drivers, do you mean we should replicate the >> cgroup infrastructure there or do you mean either drm or specific >> driver should query existing hierarchy (such as device or perhaps >> cpu) for the process organization information? >> >> To put the questions in more concrete terms, let say a user wants to >> expose certain part of a gpu to a particular cgroup similar to the >> way selective cpu cores are exposed to a cgroup via cpuset, how >> should we go about enabling such functionality? > Do what the intel driver or bpf is doing? It's not difficult to hook > into cgroup for identification purposes. > > Thanks. >
Quoting Kasiviswanathan, Harish (2018-11-26 22:59:30) > Thanks Tejun,Eric and Christian for your replies. > > We want GPUs resource management to work seamlessly with containers and container orchestration. With the Intel / bpf based approach this is not possible. > > From your response we gather the following. GPU resources need to be abstracted. We will send a new proposal in same vein. Our current thinking is to start with a single abstracted resource and build a framework that can be expanded to include additional resources. We plan to start with “GPU cores”. We believe all GPUs have some concept of cores or compute unit. I think a more abstract property "% of GPU (processing power)" might be a more universal approach. One can then implement that through subdividing the resources or timeslicing them, depending on the GPU topology. Leasing 1/8th, 1/4th or 1/2 of the GPU would probably be the most applicable to cloud provider usecases, too. At least that's what I see done for the CPUs today. That combined with the "GPU memory usable" property should be a good starting point to start subdividing the GPU resources for multiple users. Regards, Joonas > > Your feedback is highly appreciated. > > Best Regards, > Harish > > > > From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Tejun Heo <tj@kernel.org> > Sent: Tuesday, November 20, 2018 5:30 PM > To: Ho, Kenny > Cc: cgroups@vger.kernel.org; intel-gfx@lists.freedesktop.org; y2kenny@gmail.com; amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org > Subject: Re: [PATCH RFC 2/5] cgroup: Add mechanism to register vendor specific DRM devices > > > Hello, > > On Tue, Nov 20, 2018 at 10:21:14PM +0000, Ho, Kenny wrote: > > By this reply, are you suggesting that vendor specific resources > > will never be acceptable to be managed under cgroup? 
Let say a user > > I wouldn't say never but whatever which gets included as a cgroup > controller should have clearly defined resource abstractions and the > control schemes around them including support for delegation. AFAICS, > gpu side still seems to have a long way to go (and it's not clear > whether that's somewhere it will or needs to end up). > > > want to have similar functionality as what cgroup is offering but to > > manage vendor specific resources, what would you suggest as a > > solution? When you say keeping vendor specific resource regulation > > inside drm or specific drivers, do you mean we should replicate the > > cgroup infrastructure there or do you mean either drm or specific > > driver should query existing hierarchy (such as device or perhaps > > cpu) for the process organization information? > > > > To put the questions in more concrete terms, let say a user wants to > > expose certain part of a gpu to a particular cgroup similar to the > > way selective cpu cores are exposed to a cgroup via cpuset, how > > should we go about enabling such functionality? > > Do what the intel driver or bpf is doing? It's not difficult to hook > into cgroup for identification purposes. > > Thanks. > > -- > tejun > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > > amd-gfx Info Page - freedesktop.org > lists.freedesktop.org > To see the collection of prior postings to the list, visit the amd-gfx Archives.. Using amd-gfx: To post a message to all the list members, send email to amd-gfx@lists.freedesktop.org. You can subscribe to the list, or change your existing subscription, in the sections below. > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
On Tue, Nov 27, 2018 at 4:46 AM Joonas Lahtinen <joonas.lahtinen@linux.intel.com> wrote: > I think a more abstract property "% of GPU (processing power)" might > be a more universal approach. One can then implement that through > subdividing the resources or timeslicing them, depending on the GPU > topology. > > Leasing 1/8th, 1/4th or 1/2 of the GPU would probably be the most > applicable to cloud provider usecases, too. At least that's what I > see done for the CPUs today. I think there are opportunities to slice the gpu in more than one way (similar to the way it is done for cpu.) We can potentially frame resources as continuous or discrete. Percentage definitely fits well for continuous measurements such as time/time slices but I think there are places for discrete units such as core counts as well. Regards, Kenny > That combined with the "GPU memory usable" property should be a good > starting point to start subdividing the GPU resources for multiple > users. > > Regards, Joonas > > > > > Your feedback is highly appreciated. > > > > Best Regards, > > Harish > > > > > > > > From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Tejun Heo <tj@kernel.org> > > Sent: Tuesday, November 20, 2018 5:30 PM > > To: Ho, Kenny > > Cc: cgroups@vger.kernel.org; intel-gfx@lists.freedesktop.org; y2kenny@gmail.com; amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org > > Subject: Re: [PATCH RFC 2/5] cgroup: Add mechanism to register vendor specific DRM devices > > > > > > Hello, > > > > On Tue, Nov 20, 2018 at 10:21:14PM +0000, Ho, Kenny wrote: > > > By this reply, are you suggesting that vendor specific resources > > > will never be acceptable to be managed under cgroup? Let say a user > > > > I wouldn't say never but whatever which gets included as a cgroup > > controller should have clearly defined resource abstractions and the > > control schemes around them including support for delegation. 
AFAICS, > > gpu side still seems to have a long way to go (and it's not clear > > whether that's somewhere it will or needs to end up). > > > > > want to have similar functionality as what cgroup is offering but to > > > manage vendor specific resources, what would you suggest as a > > > solution? When you say keeping vendor specific resource regulation > > > inside drm or specific drivers, do you mean we should replicate the > > > cgroup infrastructure there or do you mean either drm or specific > > > driver should query existing hierarchy (such as device or perhaps > > > cpu) for the process organization information? > > > > > > To put the questions in more concrete terms, let say a user wants to > > > expose certain part of a gpu to a particular cgroup similar to the > > > way selective cpu cores are exposed to a cgroup via cpuset, how > > > should we go about enabling such functionality? > > > > Do what the intel driver or bpf is doing? It's not difficult to hook > > into cgroup for identification purposes. > > > > Thanks. > > > > -- > > tejun > > _______________________________________________ > > amd-gfx mailing list > > amd-gfx@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > > > > > amd-gfx Info Page - freedesktop.org > > lists.freedesktop.org > > To see the collection of prior postings to the list, visit the amd-gfx Archives.. Using amd-gfx: To post a message to all the list members, send email to amd-gfx@lists.freedesktop.org. You can subscribe to the list, or change your existing subscription, in the sections below. > > > > _______________________________________________ > > Intel-gfx mailing list > > Intel-gfx@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Quoting Ho, Kenny (2018-11-27 17:41:17) > On Tue, Nov 27, 2018 at 4:46 AM Joonas Lahtinen <joonas.lahtinen@linux.intel.com> wrote: > > I think a more abstract property "% of GPU (processing power)" might > > be a more universal approach. One can then implement that through > > subdividing the resources or timeslicing them, depending on the GPU > > topology. > > > > Leasing 1/8th, 1/4th or 1/2 of the GPU would probably be the most > > applicable to cloud provider usecases, too. At least that's what I > > see done for the CPUs today. > I think there are opportunities to slice the gpu in more than one way (similar to the way it is done for cpu.) We can potentially frame resources as continuous or discrete. Percentage definitely fits well for continuous measurements such as time/time slices but I think there are places for discrete units such as core counts as well. I think the ask in return to the early series from Intal was to agree on the variables that could be common to all of DRM subsystem. So we can only choose the lowest common denominator, right? Any core count out of total core count should translate nicely into a fraction, so what would be the problem with percentage amounts? Regards, Joonas > > Regards, > Kenny > > > That combined with the "GPU memory usable" property should be a good > > starting point to start subdividing the GPU resources for multiple > > users. > > > > Regards, Joonas > > > > > > > > Your feedback is highly appreciated. 
> > > > > > Best Regards, > > > Harish > > > > > > > > > > > > From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Tejun Heo <tj@kernel.org> > > > Sent: Tuesday, November 20, 2018 5:30 PM > > > To: Ho, Kenny > > > Cc: cgroups@vger.kernel.org; intel-gfx@lists.freedesktop.org; y2kenny@gmail.com; amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org > > > Subject: Re: [PATCH RFC 2/5] cgroup: Add mechanism to register vendor specific DRM devices > > > > > > > > > Hello, > > > > > > On Tue, Nov 20, 2018 at 10:21:14PM +0000, Ho, Kenny wrote: > > > > By this reply, are you suggesting that vendor specific resources > > > > will never be acceptable to be managed under cgroup? Let say a user > > > > > > I wouldn't say never but whatever which gets included as a cgroup > > > controller should have clearly defined resource abstractions and the > > > control schemes around them including support for delegation. AFAICS, > > > gpu side still seems to have a long way to go (and it's not clear > > > whether that's somewhere it will or needs to end up). > > > > > > > want to have similar functionality as what cgroup is offering but to > > > > manage vendor specific resources, what would you suggest as a > > > > solution? When you say keeping vendor specific resource regulation > > > > inside drm or specific drivers, do you mean we should replicate the > > > > cgroup infrastructure there or do you mean either drm or specific > > > > driver should query existing hierarchy (such as device or perhaps > > > > cpu) for the process organization information? > > > > > > > > To put the questions in more concrete terms, let say a user wants to > > > > expose certain part of a gpu to a particular cgroup similar to the > > > > way selective cpu cores are exposed to a cgroup via cpuset, how > > > > should we go about enabling such functionality? > > > > > > Do what the intel driver or bpf is doing? It's not difficult to hook > > > into cgroup for identification purposes. 
> > > > > > Thanks. > > > > > > -- > > > tejun > > > _______________________________________________ > > > amd-gfx mailing list > > > amd-gfx@lists.freedesktop.org > > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > > > > > > > > amd-gfx Info Page - freedesktop.org > > > lists.freedesktop.org > > > To see the collection of prior postings to the list, visit the amd-gfx Archives.. Using amd-gfx: To post a message to all the list members, send email to amd-gfx@lists.freedesktop.org. You can subscribe to the list, or change your existing subscription, in the sections below. > > > > > > _______________________________________________ > > > Intel-gfx mailing list > > > Intel-gfx@lists.freedesktop.org > > > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
On Wed, Nov 28, 2018 at 4:14 AM Joonas Lahtinen <joonas.lahtinen@linux.intel.com> wrote: > So we can only choose the lowest common denominator, right? > > Any core count out of total core count should translate nicely into a > fraction, so what would be the problem with percentage amounts? I don't think having an abstracted resource necessarily equates to the 'lowest'. The issue with percentage is the lack of precision. If you look at the cpuset cgroup, you can see the specification can be very precise: # /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 (https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt) The driver can translate something like this to a core count and then to a percentage and handle it accordingly, while the reverse is not possible. (You can't tell which set of CUs/EUs a user wants from a percentage request.) It's also not clear to me, from a user/application/admin/resource management perspective, how the base core count of a GPU is relevant to the workload (since percentage is a 'relative' quantity.) For example, let's say a workload wants to use 256 'cores'; does it matter if that workload is put on a GPU with 1024 cores or a GPU with 4096 cores total? I am not dismissing the possible need for percentage. I just think there should be a way to accommodate more than just the 'lowest'. Regards, Kenny > > > That combined with the "GPU memory usable" property should be a good > > > starting point to start subdividing the GPU resources for multiple > > > users. > > > > > > Regards, Joonas > > > > > > > > > > > Your feedback is highly appreciated. 
> > > > > > > > Best Regards, > > > > Harish > > > > > > > > > > > > > > > > From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Tejun Heo <tj@kernel.org> > > > > Sent: Tuesday, November 20, 2018 5:30 PM > > > > To: Ho, Kenny > > > > Cc: cgroups@vger.kernel.org; intel-gfx@lists.freedesktop.org; y2kenny@gmail.com; amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org > > > > Subject: Re: [PATCH RFC 2/5] cgroup: Add mechanism to register vendor specific DRM devices > > > > > > > > > > > > Hello, > > > > > > > > On Tue, Nov 20, 2018 at 10:21:14PM +0000, Ho, Kenny wrote: > > > > > By this reply, are you suggesting that vendor specific resources > > > > > will never be acceptable to be managed under cgroup? Let say a user > > > > > > > > I wouldn't say never but whatever which gets included as a cgroup > > > > controller should have clearly defined resource abstractions and the > > > > control schemes around them including support for delegation. AFAICS, > > > > gpu side still seems to have a long way to go (and it's not clear > > > > whether that's somewhere it will or needs to end up). > > > > > > > > > want to have similar functionality as what cgroup is offering but to > > > > > manage vendor specific resources, what would you suggest as a > > > > > solution? When you say keeping vendor specific resource regulation > > > > > inside drm or specific drivers, do you mean we should replicate the > > > > > cgroup infrastructure there or do you mean either drm or specific > > > > > driver should query existing hierarchy (such as device or perhaps > > > > > cpu) for the process organization information? > > > > > > > > > > To put the questions in more concrete terms, let say a user wants to > > > > > expose certain part of a gpu to a particular cgroup similar to the > > > > > way selective cpu cores are exposed to a cgroup via cpuset, how > > > > > should we go about enabling such functionality? 
> > > > > > > > Do what the intel driver or bpf is doing? It's not difficult to hook > > > > into cgroup for identification purposes. > > > > > > > > Thanks. > > > > > > > > -- > > > > tejun > > > > _______________________________________________ > > > > amd-gfx mailing list > > > > amd-gfx@lists.freedesktop.org > > > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > > > > > > > > > > > amd-gfx Info Page - freedesktop.org > > > > lists.freedesktop.org > > > > To see the collection of prior postings to the list, visit the amd-gfx Archives.. Using amd-gfx: To post a message to all the list members, send email to amd-gfx@lists.freedesktop.org. You can subscribe to the list, or change your existing subscription, in the sections below. > > > > > > > > _______________________________________________ > > > > Intel-gfx mailing list > > > > Intel-gfx@lists.freedesktop.org > > > > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
On Wed, Nov 28, 2018 at 07:46:06PM +0000, Ho, Kenny wrote: > > On Wed, Nov 28, 2018 at 4:14 AM Joonas Lahtinen <joonas.lahtinen@linux.intel.com> wrote: > > So we can only choose the lowest common denominator, right? > > > > Any core count out of total core count should translate nicely into a > > fraction, so what would be the problem with percentage amounts? > > I don't think having an abstracted resource necessarily equate > 'lowest'. The issue with percentage is the lack of precision. If you > look at cpuset cgroup, you can see the specification can be very > precise: > > # /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6 > (https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt) > > The driver can translate something like this to core count and then to > percentage and handle accordingly while the reverse is not possible. > (You can't tell which set of CUs/EUs a user want from a percentage > request.) It's also not clear to me, from > user/application/admin/resource management perspective, how the base > core counts of a GPU is relevant to the workload (since percentage is > a 'relative' quantity.) For example, let say a workload wants to use > 256 'cores', does it matter if that workload is put on a GPU with 1024 > cores or a GPU with 4096 cores total? > > I am not dismissing the possible need for percentage. I just think > there should be a way to accommodate more than just the 'lowest'. > As you noted, your proposal is similar to the cgroup-v1 "cpuset" controller, which is sort of a way of partitioning your underlying hardware resources; I think Joonas is describing something closer in design to the cgroup-v2 "cpu" controller, which partitions the general time/usage allocated to via cgroup; afaiu, "cpu" doesn't really care which specific core the tasks run on, just the relative weights that determine how much time they get to run on any of the cores. 
It sounds like with your hardware, your kernel driver is able to specify exactly which subset of GPU EU's a specific GPU context winds up running on. However I think there are a lot of platforms that don't allow that kind of low-level control. E.g., I don't think we can do that on Intel hardware; we have a handful of high-level GPU engines that we can submit different types of batchbuffers to (render, blitter, media, etc.). What we can do is use GPU preemption to limit how much time specific GPU contexts get to run on the render engine before the engine is reclaimed for use by a different context. Using a %gputime approach like Joonas is suggesting could be handled in a driver by reserving specific subsets of EU's on hardware like yours that's capable of doing that, whereas it could be mostly handled on other types of hardware via GPU engine preemption. I think either approach "gpu_euset" or "%gputime" should map well to a cgroup controller implementation. Granted, neither one solves the specific use case I was working on earlier this year where we need unfair (starvation-okay) scheduling that will run contexts strictly according to priority (i.e., lower priority contexts will never run at all unless all higher priority contexts have completed all of their submitted work), but that's a pretty specialized use case that we'll probably need to handle in a different manner anyway. Matt > Regards, > Kennny > > > > > > That combined with the "GPU memory usable" property should be a good > > > > starting point to start subdividing the GPU resources for multiple > > > > users. > > > > > > > > Regards, Joonas > > > > > > > > > > > > > > Your feedback is highly appreciated. 
> > > > > > > > > > Best Regards, > > > > > Harish > > > > > > > > > > > > > > > > > > > > From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Tejun Heo <tj@kernel.org> > > > > > Sent: Tuesday, November 20, 2018 5:30 PM > > > > > To: Ho, Kenny > > > > > Cc: cgroups@vger.kernel.org; intel-gfx@lists.freedesktop.org; y2kenny@gmail.com; amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org > > > > > Subject: Re: [PATCH RFC 2/5] cgroup: Add mechanism to register vendor specific DRM devices > > > > > > > > > > > > > > > Hello, > > > > > > > > > > On Tue, Nov 20, 2018 at 10:21:14PM +0000, Ho, Kenny wrote: > > > > > > By this reply, are you suggesting that vendor specific resources > > > > > > will never be acceptable to be managed under cgroup? Let say a user > > > > > > > > > > I wouldn't say never but whatever which gets included as a cgroup > > > > > controller should have clearly defined resource abstractions and the > > > > > control schemes around them including support for delegation. AFAICS, > > > > > gpu side still seems to have a long way to go (and it's not clear > > > > > whether that's somewhere it will or needs to end up). > > > > > > > > > > > want to have similar functionality as what cgroup is offering but to > > > > > > manage vendor specific resources, what would you suggest as a > > > > > > solution? When you say keeping vendor specific resource regulation > > > > > > inside drm or specific drivers, do you mean we should replicate the > > > > > > cgroup infrastructure there or do you mean either drm or specific > > > > > > driver should query existing hierarchy (such as device or perhaps > > > > > > cpu) for the process organization information? 
> > > > > > > > > > > > To put the questions in more concrete terms, let say a user wants to > > > > > > expose certain part of a gpu to a particular cgroup similar to the > > > > > > way selective cpu cores are exposed to a cgroup via cpuset, how > > > > > > should we go about enabling such functionality? > > > > > > > > > > Do what the intel driver or bpf is doing? It's not difficult to hook > > > > > into cgroup for identification purposes. > > > > > > > > > > Thanks. > > > > > > > > > > -- > > > > > tejun > > > > > _______________________________________________ > > > > > amd-gfx mailing list > > > > > amd-gfx@lists.freedesktop.org > > > > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > > > > > > > > > > > > > > amd-gfx Info Page - freedesktop.org > > > > > lists.freedesktop.org > > > > > To see the collection of prior postings to the list, visit the amd-gfx Archives.. Using amd-gfx: To post a message to all the list members, send email to amd-gfx@lists.freedesktop.org. You can subscribe to the list, or change your existing subscription, in the sections below. > > > > > > > > > > _______________________________________________ > > > > > Intel-gfx mailing list > > > > > Intel-gfx@lists.freedesktop.org > > > > > https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Hey Matt, On Fri, Nov 30, 2018 at 5:22 PM Matt Roper <matthew.d.roper@intel.com> wrote: > I think Joonas is describing something closer in > design to the cgroup-v2 "cpu" controller, which partitions the general > time/usage allocated to via cgroup; afaiu, "cpu" doesn't really care > which specific core the tasks run on, just the relative weights that > determine how much time they get to run on any of the cores. Depending on the level of optimization one wants to do, I think people care about which cpu core a task runs on. Modern processors are no longer a monolithic 'thing'. At least for AMD, there are multiple cpus on a core complex (CCX), multiple CCX on a die, and multiple dies on a processor. A task running on cpu 0 and cpu 1 on die 0 will behave very differently from a task running on core 0s on die 0 and die 1 on the same socket. (https://en.wikichip.org/wiki/amd/microarchitectures/zen#Die-die_memory_latencies) It's not just an AMD thing either. Here is an open issue on Intel's architecture: https://github.com/kubernetes/kubernetes/issues/67355 and a proposed solution using cpu affinity https://github.com/kubernetes/community/blob/630acc487c80e4981a232cdd8400eb8207119788/keps/sig-node/0030-qos-class-cpu-affinity.md#proposal (by one of your colleagues.) The time-based sharing below is also something we are thinking about, but it's personally not as exciting as the resource-based sharing for me because the time-share use case has already been addressed by our SRIOV/virtualization products. We can potentially have different level of time sharing using cgroup though (in addition to SRIOV), potentially trading efficiency against isolation. That said, I think the time-based approach maybe orthogonal to the resource-based approach (orthogonal in the sense that both are needed depending on the usage.) 
Regards, Kenny > It sounds like with your hardware, your kernel driver is able to specify > exactly which subset of GPU EU's a specific GPU context winds up running > on. However I think there are a lot of platforms that don't allow that > kind of low-level control. E.g., I don't think we can do that on Intel > hardware; we have a handful of high-level GPU engines that we can submit > different types of batchbuffers to (render, blitter, media, etc.). What > we can do is use GPU preemption to limit how much time specific GPU > contexts get to run on the render engine before the engine is reclaimed > for use by a different context. > > Using a %gputime approach like Joonas is suggesting could be handled in > a driver by reserving specific subsets of EU's on hardware like yours > that's capable of doing that, whereas it could be mostly handled on > other types of hardware via GPU engine preemption. > > I think either approach "gpu_euset" or "%gputime" should map well to a > cgroup controller implementation. Granted, neither one solves the > specific use case I was working on earlier this year where we need > unfair (starvation-okay) scheduling that will run contexts strictly > according to priority (i.e., lower priority contexts will never run at > all unless all higher priority contexts have completed all of their > submitted work), but that's a pretty specialized use case that we'll > probably need to handle in a different manner anyway. > > > Matt > > > > Regards, > > Kennny > > > > > > > > > That combined with the "GPU memory usable" property should be a good > > > > > starting point to start subdividing the GPU resources for multiple > > > > > users. > > > > > > > > > > Regards, Joonas > > > > > > > > > > > > > > > > > Your feedback is highly appreciated. 
> > > > > > > > > > > > Best Regards, > > > > > > Harish > > > > > > > > > > > > > > > > > > > > > > > > From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Tejun Heo <tj@kernel.org> > > > > > > Sent: Tuesday, November 20, 2018 5:30 PM > > > > > > To: Ho, Kenny > > > > > > Cc: cgroups@vger.kernel.org; intel-gfx@lists.freedesktop.org; y2kenny@gmail.com; amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org > > > > > > Subject: Re: [PATCH RFC 2/5] cgroup: Add mechanism to register vendor specific DRM devices > > > > > > > > > > > > > > > > > > Hello, > > > > > > > > > > > > On Tue, Nov 20, 2018 at 10:21:14PM +0000, Ho, Kenny wrote: > > > > > > > By this reply, are you suggesting that vendor specific resources > > > > > > > will never be acceptable to be managed under cgroup? Let say a user > > > > > > > > > > > > I wouldn't say never but whatever which gets included as a cgroup > > > > > > controller should have clearly defined resource abstractions and the > > > > > > control schemes around them including support for delegation. AFAICS, > > > > > > gpu side still seems to have a long way to go (and it's not clear > > > > > > whether that's somewhere it will or needs to end up). > > > > > > > > > > > > > want to have similar functionality as what cgroup is offering but to > > > > > > > manage vendor specific resources, what would you suggest as a > > > > > > > solution? When you say keeping vendor specific resource regulation > > > > > > > inside drm or specific drivers, do you mean we should replicate the > > > > > > > cgroup infrastructure there or do you mean either drm or specific > > > > > > > driver should query existing hierarchy (such as device or perhaps > > > > > > > cpu) for the process organization information? 
> > > > > > > > > > > > > > To put the questions in more concrete terms, let say a user wants to > > > > > > > expose certain part of a gpu to a particular cgroup similar to the > > > > > > > way selective cpu cores are exposed to a cgroup via cpuset, how > > > > > > > should we go about enabling such functionality? > > > > > > > > > > > > Do what the intel driver or bpf is doing? It's not difficult to hook > > > > > > into cgroup for identification purposes. > > > > > > > > > > > > Thanks. > > > > > > > > > > > > -- > > > > > > tejun > > > > > > _______________________________________________ > > > > > > amd-gfx mailing list > > > > > > amd-gfx@lists.freedesktop.org > > > > > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > > > > > > > > > > > > > > > > > amd-gfx Info Page - freedesktop.org > > > > > > lists.freedesktop.org > > > > > > To see the collection of prior postings to the list, visit the amd-gfx Archives.. Using amd-gfx: To post a message to all the list members, send email to amd-gfx@lists.freedesktop.org. You can subscribe to the list, or change your existing subscription, in the sections below. > > > > > > > > > > > > _______________________________________________ > > > > > > Intel-gfx mailing list > > > > > > Intel-gfx@lists.freedesktop.org > > > > > > https://lists.freedesktop.org/mailman/listinfo/intel-gfx > > -- > Matt Roper > Graphics Software Engineer > IoTG Platform Enabling & Development > Intel Corporation > (916) 356-2795
On Mon, Dec 03, 2018 at 06:46:01AM +0000, Ho, Kenny wrote: > Hey Matt, > > On Fri, Nov 30, 2018 at 5:22 PM Matt Roper <matthew.d.roper@intel.com> wrote: > > I think Joonas is describing something closer in > > design to the cgroup-v2 "cpu" controller, which partitions the general > > time/usage allocated to via cgroup; afaiu, "cpu" doesn't really care > > which specific core the tasks run on, just the relative weights that > > determine how much time they get to run on any of the cores. > > Depending on the level of optimization one wants to do, I think people > care about which cpu core a task runs on. Modern processors are no > longer a monolithic 'thing'. At least for AMD, there are multiple > cpus on a core complex (CCX), multiple CCX on a die, and multiple dies > on a processor. A task running on cpu 0 and cpu 1 on die 0 will > behave very differently from a task running on core 0s on die 0 and > die 1 on the same socket. > (https://en.wikichip.org/wiki/amd/microarchitectures/zen#Die-die_memory_latencies) > > It's not just an AMD thing either. Here is an open issue on Intel's architecture: > https://github.com/kubernetes/kubernetes/issues/67355 > > and a proposed solution using cpu affinity > https://github.com/kubernetes/community/blob/630acc487c80e4981a232cdd8400eb8207119788/keps/sig-node/0030-qos-class-cpu-affinity.md#proposal > (by one of your colleagues.) Right, I didn't mean to imply that the use case wasn't valid, I was just referring to how I believe the cgroup-v2 'cpu' controller (i.e., cpu_cgrp_subsys) currently behaves, as a contrast to the behavior of the cgroup-v1 'cpuset' controller. I can definitely understand your motivation for wanting something along the lines of a "gpuset" controller, but as far as I know, that just isn't something that's possible to implement on a lot of GPU's. 
> > The time-based sharing below is also something we are thinking about, > but it's personally not as exciting as the resource-based sharing for > me because the time-share use case has already been addressed by our > SRIOV/virtualization products. We can potentially have different > level of time sharing using cgroup though (in addition to SRIOV), > potentially trading efficiency against isolation. That said, I think > the time-based approach maybe orthogonal to the resource-based > approach (orthogonal in the sense that both are needed depending on > the usage.) Makes sense. Matt > > Regards, > Kenny > > > > It sounds like with your hardware, your kernel driver is able to specify > > exactly which subset of GPU EU's a specific GPU context winds up running > > on. However I think there are a lot of platforms that don't allow that > > kind of low-level control. E.g., I don't think we can do that on Intel > > hardware; we have a handful of high-level GPU engines that we can submit > > different types of batchbuffers to (render, blitter, media, etc.). What > > we can do is use GPU preemption to limit how much time specific GPU > > contexts get to run on the render engine before the engine is reclaimed > > for use by a different context. > > > > Using a %gputime approach like Joonas is suggesting could be handled in > > a driver by reserving specific subsets of EU's on hardware like yours > > that's capable of doing that, whereas it could be mostly handled on > > other types of hardware via GPU engine preemption. > > > > I think either approach "gpu_euset" or "%gputime" should map well to a > > cgroup controller implementation. 
Granted, neither one solves the > > specific use case I was working on earlier this year where we need > > unfair (starvation-okay) scheduling that will run contexts strictly > > according to priority (i.e., lower priority contexts will never run at > > all unless all higher priority contexts have completed all of their > > submitted work), but that's a pretty specialized use case that we'll > > probably need to handle in a different manner anyway. > > > > > > Matt > > > > > > > Regards, > > > Kennny > > > > > > > > > > > > That combined with the "GPU memory usable" property should be a good > > > > > > starting point to start subdividing the GPU resources for multiple > > > > > > users. > > > > > > > > > > > > Regards, Joonas > > > > > > > > > > > > > > > > > > > > Your feedback is highly appreciated. > > > > > > > > > > > > > > Best Regards, > > > > > > > Harish > > > > > > > > > > > > > > > > > > > > > > > > > > > > From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Tejun Heo <tj@kernel.org> > > > > > > > Sent: Tuesday, November 20, 2018 5:30 PM > > > > > > > To: Ho, Kenny > > > > > > > Cc: cgroups@vger.kernel.org; intel-gfx@lists.freedesktop.org; y2kenny@gmail.com; amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org > > > > > > > Subject: Re: [PATCH RFC 2/5] cgroup: Add mechanism to register vendor specific DRM devices > > > > > > > > > > > > > > > > > > > > > Hello, > > > > > > > > > > > > > > On Tue, Nov 20, 2018 at 10:21:14PM +0000, Ho, Kenny wrote: > > > > > > > > By this reply, are you suggesting that vendor specific resources > > > > > > > > will never be acceptable to be managed under cgroup? Let say a user > > > > > > > > > > > > > > I wouldn't say never but whatever which gets included as a cgroup > > > > > > > controller should have clearly defined resource abstractions and the > > > > > > > control schemes around them including support for delegation. 
AFAICS, > > > > > > > gpu side still seems to have a long way to go (and it's not clear > > > > > > > whether that's somewhere it will or needs to end up). > > > > > > > > > > > > > > > want to have similar functionality as what cgroup is offering but to > > > > > > > > manage vendor specific resources, what would you suggest as a > > > > > > > > solution? When you say keeping vendor specific resource regulation > > > > > > > > inside drm or specific drivers, do you mean we should replicate the > > > > > > > > cgroup infrastructure there or do you mean either drm or specific > > > > > > > > driver should query existing hierarchy (such as device or perhaps > > > > > > > > cpu) for the process organization information? > > > > > > > > > > > > > > > > To put the questions in more concrete terms, let say a user wants to > > > > > > > > expose certain part of a gpu to a particular cgroup similar to the > > > > > > > > way selective cpu cores are exposed to a cgroup via cpuset, how > > > > > > > > should we go about enabling such functionality? > > > > > > > > > > > > > > Do what the intel driver or bpf is doing? It's not difficult to hook > > > > > > > into cgroup for identification purposes. > > > > > > > > > > > > > > Thanks. > > > > > > > > > > > > > > -- > > > > > > > tejun > > > > > > > _______________________________________________ > > > > > > > amd-gfx mailing list > > > > > > > amd-gfx@lists.freedesktop.org > > > > > > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > > > > > > > > > > > > > > > > > > > > amd-gfx Info Page - freedesktop.org > > > > > > > lists.freedesktop.org > > > > > > > To see the collection of prior postings to the list, visit the amd-gfx Archives.. Using amd-gfx: To post a message to all the list members, send email to amd-gfx@lists.freedesktop.org. You can subscribe to the list, or change your existing subscription, in the sections below. 
> > > > > > > > > > > > > > _______________________________________________ > > > > > > > Intel-gfx mailing list > > > > > > > Intel-gfx@lists.freedesktop.org > > > > > > > https://lists.freedesktop.org/mailman/listinfo/intel-gfx > > > > -- > > Matt Roper > > Graphics Software Engineer > > IoTG Platform Enabling & Development > > Intel Corporation > > (916) 356-2795
On 2018-11-28 4:14 a.m., Joonas Lahtinen wrote: > Quoting Ho, Kenny (2018-11-27 17:41:17) >> On Tue, Nov 27, 2018 at 4:46 AM Joonas Lahtinen <joonas.lahtinen@linux.intel.com> wrote: >>> I think a more abstract property "% of GPU (processing power)" might >>> be a more universal approach. One can then implement that through >>> subdividing the resources or timeslicing them, depending on the GPU >>> topology. >>> >>> Leasing 1/8th, 1/4th or 1/2 of the GPU would probably be the most >>> applicable to cloud provider usecases, too. At least that's what I >>> see done for the CPUs today. >> I think there are opportunities to slice the gpu in more than one way (similar to the way it is done for cpu.) We can potentially frame resources as continuous or discrete. Percentage definitely fits well for continuous measurements such as time/time slices but I think there are places for discrete units such as core counts as well. > I think the ask in return to the early series from Intel was to agree > on the variables that could be common to all of DRM subsystem. > > So we can only choose the lowest common denominator, right? > > Any core count out of total core count should translate nicely into a > fraction, so what would be the problem with percentage amounts? How would you handle overcommitment with a percentage? That is, more than 100% of the GPU cores assigned to cgroups. Which cgroups end up sharing cores would be up to chance. If we allow specifying a set of GPU cores, we can be more specific in assigning and sharing resources between cgroups. Regards, Felix > > Regards, Joonas > >> Regards, >> Kenny >> >>> That combined with the "GPU memory usable" property should be a good >>> starting point to start subdividing the GPU resources for multiple >>> users. >>> >>> Regards, Joonas >>> >>>> Your feedback is highly appreciated. 
>>>> >>>> Best Regards, >>>> Harish >>>> >>>> >>>> >>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Tejun Heo <tj@kernel.org> >>>> Sent: Tuesday, November 20, 2018 5:30 PM >>>> To: Ho, Kenny >>>> Cc: cgroups@vger.kernel.org; intel-gfx@lists.freedesktop.org; y2kenny@gmail.com; amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org >>>> Subject: Re: [PATCH RFC 2/5] cgroup: Add mechanism to register vendor specific DRM devices >>>> >>>> >>>> Hello, >>>> >>>> On Tue, Nov 20, 2018 at 10:21:14PM +0000, Ho, Kenny wrote: >>>>> By this reply, are you suggesting that vendor specific resources >>>>> will never be acceptable to be managed under cgroup? Let say a user >>>> I wouldn't say never but whatever which gets included as a cgroup >>>> controller should have clearly defined resource abstractions and the >>>> control schemes around them including support for delegation. AFAICS, >>>> gpu side still seems to have a long way to go (and it's not clear >>>> whether that's somewhere it will or needs to end up). >>>> >>>>> want to have similar functionality as what cgroup is offering but to >>>>> manage vendor specific resources, what would you suggest as a >>>>> solution? When you say keeping vendor specific resource regulation >>>>> inside drm or specific drivers, do you mean we should replicate the >>>>> cgroup infrastructure there or do you mean either drm or specific >>>>> driver should query existing hierarchy (such as device or perhaps >>>>> cpu) for the process organization information? >>>>> >>>>> To put the questions in more concrete terms, let say a user wants to >>>>> expose certain part of a gpu to a particular cgroup similar to the >>>>> way selective cpu cores are exposed to a cgroup via cpuset, how >>>>> should we go about enabling such functionality? >>>> Do what the intel driver or bpf is doing? It's not difficult to hook >>>> into cgroup for identification purposes. >>>> >>>> Thanks. 
>>>> >>>> -- >>>> tejun >>>> _______________________________________________ >>>> amd-gfx mailing list >>>> amd-gfx@lists.freedesktop.org >>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx >>>> >>>> >>>> amd-gfx Info Page - freedesktop.org >>>> lists.freedesktop.org >>>> To see the collection of prior postings to the list, visit the amd-gfx Archives.. Using amd-gfx: To post a message to all the list members, send email to amd-gfx@lists.freedesktop.org. You can subscribe to the list, or change your existing subscription, in the sections below. >>>> >>>> _______________________________________________ >>>> Intel-gfx mailing list >>>> Intel-gfx@lists.freedesktop.org >>>> https://lists.freedesktop.org/mailman/listinfo/intel-gfx > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Quoting Kuehling, Felix (2018-12-03 22:55:16) > > On 2018-11-28 4:14 a.m., Joonas Lahtinen wrote: > > Quoting Ho, Kenny (2018-11-27 17:41:17) > >> On Tue, Nov 27, 2018 at 4:46 AM Joonas Lahtinen <joonas.lahtinen@linux.intel.com> wrote: > >>> I think a more abstract property "% of GPU (processing power)" might > >>> be a more universal approach. One can then implement that through > >>> subdividing the resources or timeslicing them, depending on the GPU > >>> topology. > >>> > >>> Leasing 1/8th, 1/4th or 1/2 of the GPU would probably be the most > >>> applicable to cloud provider usecases, too. At least that's what I > >>> see done for the CPUs today. > >> I think there are opportunities to slice the gpu in more than one way (similar to the way it is done for cpu.) We can potentially frame resources as continuous or discrete. Percentage definitely fits well for continuous measurements such as time/time slices but I think there are places for discrete units such as core counts as well. > > I think the ask in return to the early series from Intel was to agree > > on the variables that could be common to all of DRM subsystem. > > > > So we can only choose the lowest common denominator, right? > > > > Any core count out of total core count should translate nicely into a > > fraction, so what would be the problem with percentage amounts? > How would you handle overcommitment with a percentage? That is, more > than 100% of the GPU cores assigned to cgroups. Which cgroups end up > sharing cores would be up to chance. I see your point. With time-slicing, you really can't overcommit. So I would assume that there would have to be a second level of detail provided for overcommitting (and deciding which cgroups are to share GPU cores). > If we allow specifying a set of GPU cores, we can be more specific in > assigning and sharing resources between cgroups. As Matt outlined in the other reply to this thread, we don't really have the concept of GPU cores. 
We do have the command streamers, but the granularity is bit low. In your architecture, does it matter which specific cores are shared, or is it just a question of which specific cgroups would share some cores in case of overcommit? If we tack in the priority in addition to the percentage, you could make a choice to share cores only at an identical priority level only. That'd mean that in the case of overcommit, you'd aim to keep as many high priority levels free of overcommit as possible and then for lower priority cgroups you'd start overcommitting. Would that even partially address the concern? Regards, Joonas > > Regards, > Felix > > > > > > Regards, Joonas > > > >> Regards, > >> Kenny > >> > >>> That combined with the "GPU memory usable" property should be a good > >>> starting point to start subdividing the GPU resources for multiple > >>> users. > >>> > >>> Regards, Joonas > >>> > >>>> Your feedback is highly appreciated. > >>>> > >>>> Best Regards, > >>>> Harish > >>>> > >>>> > >>>> > >>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> on behalf of Tejun Heo <tj@kernel.org> > >>>> Sent: Tuesday, November 20, 2018 5:30 PM > >>>> To: Ho, Kenny > >>>> Cc: cgroups@vger.kernel.org; intel-gfx@lists.freedesktop.org; y2kenny@gmail.com; amd-gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org > >>>> Subject: Re: [PATCH RFC 2/5] cgroup: Add mechanism to register vendor specific DRM devices > >>>> > >>>> > >>>> Hello, > >>>> > >>>> On Tue, Nov 20, 2018 at 10:21:14PM +0000, Ho, Kenny wrote: > >>>>> By this reply, are you suggesting that vendor specific resources > >>>>> will never be acceptable to be managed under cgroup? Let say a user > >>>> I wouldn't say never but whatever which gets included as a cgroup > >>>> controller should have clearly defined resource abstractions and the > >>>> control schemes around them including support for delegation. 
AFAICS, > >>>> gpu side still seems to have a long way to go (and it's not clear > >>>> whether that's somewhere it will or needs to end up). > >>>> > >>>>> want to have similar functionality as what cgroup is offering but to > >>>>> manage vendor specific resources, what would you suggest as a > >>>>> solution? When you say keeping vendor specific resource regulation > >>>>> inside drm or specific drivers, do you mean we should replicate the > >>>>> cgroup infrastructure there or do you mean either drm or specific > >>>>> driver should query existing hierarchy (such as device or perhaps > >>>>> cpu) for the process organization information? > >>>>> > >>>>> To put the questions in more concrete terms, let say a user wants to > >>>>> expose certain part of a gpu to a particular cgroup similar to the > >>>>> way selective cpu cores are exposed to a cgroup via cpuset, how > >>>>> should we go about enabling such functionality? > >>>> Do what the intel driver or bpf is doing? It's not difficult to hook > >>>> into cgroup for identification purposes. > >>>> > >>>> Thanks. > >>>> > >>>> -- > >>>> tejun > >>>> _______________________________________________ > >>>> amd-gfx mailing list > >>>> amd-gfx@lists.freedesktop.org > >>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx > >>>> > >>>> > >>>> amd-gfx Info Page - freedesktop.org > >>>> lists.freedesktop.org > >>>> To see the collection of prior postings to the list, visit the amd-gfx Archives.. Using amd-gfx: To post a message to all the list members, send email to amd-gfx@lists.freedesktop.org. You can subscribe to the list, or change your existing subscription, in the sections below. 
> >>>> > >>>> _______________________________________________ > >>>> Intel-gfx mailing list > >>>> Intel-gfx@lists.freedesktop.org > >>>> https://lists.freedesktop.org/mailman/listinfo/intel-gfx > > _______________________________________________ > > amd-gfx mailing list > > amd-gfx@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h new file mode 100644 index 000000000000..26cbea7059a6 --- /dev/null +++ b/include/drm/drm_cgroup.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: MIT + * Copyright 2018 Advanced Micro Devices, Inc. + */ +#ifndef __DRM_CGROUP_H__ +#define __DRM_CGROUP_H__ + +#define DRMCGRP_VENDOR(_x) _x ## _drmcgrp_vendor_id, +enum drmcgrp_vendor_id { +#include <drm/drmcgrp_vendors.h> + DRMCGRP_VENDOR_COUNT, +}; +#undef DRMCGRP_VENDOR + +#define DRMCGRP_VENDOR(_x) extern struct drmcgrp_vendor _x ## _drmcgrp_vendor; +#include <drm/drmcgrp_vendors.h> +#undef DRMCGRP_VENDOR + + + +#ifdef CONFIG_CGROUP_DRM + +extern struct drmcgrp_vendor *drmcgrp_vendors[]; + +int drmcgrp_register_vendor(struct drmcgrp_vendor *vendor, enum drmcgrp_vendor_id id); +int drmcgrp_register_device(struct drm_device *device, enum drmcgrp_vendor_id id); + +#else +static int drmcgrp_register_vendor(struct drmcgrp_vendor *vendor, enum drmcgrp_vendor_id id) +{ + return 0; +} + +static int drmcgrp_register_device(struct drm_device *device, enum drmcgrp_vendor_id id) +{ + return 0; +} + +#endif /* CONFIG_CGROUP_DRM */ +#endif /* __DRM_CGROUP_H__ */ diff --git a/include/drm/drmcgrp_vendors.h b/include/drm/drmcgrp_vendors.h new file mode 100644 index 000000000000..b04d8649851b --- /dev/null +++ b/include/drm/drmcgrp_vendors.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: MIT + * Copyright 2018 Advanced Micro Devices, Inc. 
+ */ +#if IS_ENABLED(CONFIG_CGROUP_DRM) + + +#endif diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h index 79ab38b0f46d..a776662d9593 100644 --- a/include/linux/cgroup_drm.h +++ b/include/linux/cgroup_drm.h @@ -6,10 +6,36 @@ #ifdef CONFIG_CGROUP_DRM +#include <linux/mutex.h> #include <linux/cgroup.h> +#include <drm/drm_file.h> +#include <drm/drm_cgroup.h> + +/* limit defined per the way drm_minor_alloc operates */ +#define MAX_DRM_DEV (64 * DRM_MINOR_RENDER) + +struct drmcgrp_device { + enum drmcgrp_vendor_id vid; + struct drm_device *dev; + struct mutex mutex; +}; + +/* vendor-common resource counting goes here */ +/* this struct should be included in the vendor specific resource */ +struct drmcgrp_device_resource { + struct drmcgrp_device *ddev; +}; + +struct drmcgrp_vendor { + struct cftype *(*get_cftypes)(void); + struct drmcgrp_device_resource *(*alloc_dev_resource)(void); + void (*free_dev_resource)(struct drmcgrp_device_resource *dev_resource); +}; + struct drmcgrp { struct cgroup_subsys_state css; + struct drmcgrp_device_resource *dev_resources[MAX_DRM_DEV]; }; static inline struct drmcgrp *css_drmcgrp(struct cgroup_subsys_state *css) diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c index d9e194b9aead..f9630cc389bc 100644 --- a/kernel/cgroup/drm.c +++ b/kernel/cgroup/drm.c @@ -1,8 +1,30 @@ // SPDX-License-Identifier: MIT // Copyright 2018 Advanced Micro Devices, Inc. 
+#include <linux/export.h> #include <linux/slab.h> #include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/mutex.h> #include <linux/cgroup_drm.h> +#include <drm/drm_device.h> +#include <drm/drm_cgroup.h> + +/* generate an array of drm cgroup vendor pointers */ +#define DRMCGRP_VENDOR(_x)[_x ## _drmcgrp_vendor_id] = NULL, +struct drmcgrp_vendor *drmcgrp_vendors[] = { +#include <drm/drmcgrp_vendors.h> +}; +#undef DRMCGRP_VENDOR +EXPORT_SYMBOL(drmcgrp_vendors); + +static DEFINE_MUTEX(drmcgrp_mutex); + +/* indexed by drm_minor for access speed */ +static struct drmcgrp_device *known_drmcgrp_devs[MAX_DRM_DEV]; + +static int max_minor; + static u64 drmcgrp_test_read(struct cgroup_subsys_state *css, struct cftype *cft) @@ -13,6 +35,12 @@ static u64 drmcgrp_test_read(struct cgroup_subsys_state *css, static void drmcgrp_css_free(struct cgroup_subsys_state *css) { struct drmcgrp *drmcgrp = css_drmcgrp(css); + int i; + + for (i = 0; i <= max_minor; i++) { + if (drmcgrp->dev_resources[i] != NULL) + drmcgrp_vendors[known_drmcgrp_devs[i]->vid]->free_dev_resource(drmcgrp->dev_resources[i]); + } kfree(css_drmcgrp(css)); } @@ -21,11 +49,27 @@ static struct cgroup_subsys_state * drmcgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct drmcgrp *drmcgrp; + int i; drmcgrp = kzalloc(sizeof(struct drmcgrp), GFP_KERNEL); if (!drmcgrp) return ERR_PTR(-ENOMEM); + for (i = 0; i <= max_minor; i++) { + if (known_drmcgrp_devs[i] != NULL) { + struct drmcgrp_device_resource *ddr = + drmcgrp_vendors[known_drmcgrp_devs[i]->vid]->alloc_dev_resource(); + + if (IS_ERR(ddr)) { + drmcgrp_css_free(&drmcgrp->css); + return ERR_PTR(-ENOMEM); + } + + drmcgrp->dev_resources[i] = ddr; + drmcgrp->dev_resources[i]->ddev = known_drmcgrp_devs[i]; + } + } + return &drmcgrp->css; } @@ -44,3 +88,43 @@ struct cgroup_subsys drm_cgrp_subsys = { .legacy_cftypes = files, .dfl_cftypes = files, }; + +int drmcgrp_register_vendor(struct drmcgrp_vendor *vendor, enum 
drmcgrp_vendor_id id) +{ + int rc = 0; + struct cftype *cfts; + + // TODO: root css created before any registration + if (drmcgrp_vendors[id] == NULL) { + drmcgrp_vendors[id] = vendor; + cfts = vendor->get_cftypes(); + if (cfts != NULL) + rc = cgroup_add_legacy_cftypes(&drm_cgrp_subsys, cfts); + } + return rc; +} +EXPORT_SYMBOL(drmcgrp_register_vendor); + + +int drmcgrp_register_device(struct drm_device *dev, enum drmcgrp_vendor_id id) +{ + struct drmcgrp_device *ddev; + + ddev = kzalloc(sizeof(struct drmcgrp_device), GFP_KERNEL); + if (!ddev) + return -ENOMEM; + + mutex_lock(&drmcgrp_mutex); + + ddev->vid = id; + ddev->dev = dev; + mutex_init(&ddev->mutex); + + known_drmcgrp_devs[dev->primary->index] = ddev; + + max_minor = max(max_minor, dev->primary->index); + + mutex_unlock(&drmcgrp_mutex); + return 0; +} +EXPORT_SYMBOL(drmcgrp_register_device);
Since many parts of the DRM subsystem has vendor-specific implementations, we introduce mechanisms for vendor to register their specific resources and control files to the DRM cgroup subsystem. A vendor will register itself with the DRM cgroup subsystem first before registering individual DRM devices to the cgroup subsystem. In addition to the cgroup_subsys_state that is common to all DRM devices, a device-specific state is introduced and it is allocated according to the vendor of the device. Change-Id: I908ee6975ea0585e4c30eafde4599f87094d8c65 Signed-off-by: Kenny Ho <Kenny.Ho@amd.com> --- include/drm/drm_cgroup.h | 39 ++++++++++++++++ include/drm/drmcgrp_vendors.h | 7 +++ include/linux/cgroup_drm.h | 26 +++++++++++ kernel/cgroup/drm.c | 84 +++++++++++++++++++++++++++++++++++ 4 files changed, 156 insertions(+) create mode 100644 include/drm/drm_cgroup.h create mode 100644 include/drm/drmcgrp_vendors.h