drm/i915: Update MOCS settings for gen 9

Message ID	20170426150041.15896-1-david.weinehall@linux.intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: David Weinehall <david.weinehall@linux.intel.com> To: intel-gfx@lists.freedesktop.org Date: Wed, 26 Apr 2017 18:00:41 +0300 Message-Id: <20170426150041.15896-1-david.weinehall@linux.intel.com> Subject: [Intel-gfx] [PATCH] drm/i915: Update MOCS settings for gen 9 Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

David Weinehall April 26, 2017, 3 p.m. UTC

Add a bunch of MOCS entries for gen 9 that were missing from intel_mocs.
Some of these are used by media-sdk; if these entries are missing
the default will instead be to do everything uncached.

This patch improves media-sdk performance with up to 60%
with the (admittedly synthetic) benchmarks we use in our nightly
testing, without regressing any other benchmarks.

Signed-off-by: David Weinehall <david.weinehall@linux.intel.com>

Francisco Jerez April 26, 2017, 5:25 p.m. UTC | #1

Hi David,

David Weinehall <david.weinehall@linux.intel.com> writes:

> Add a bunch of MOCS entries for gen 9 that were missing from intel_mocs.
> Some of these are used by media-sdk; if these entries are missing
> the default will instead be to do everything uncached.
>

Keep in mind that MOCS entries are not for free -- Once you expose a new
entry it will become part of the kernel ABI and we won't ever be able to
re-use them for another purpose.  What this means is we probably don't
want to add a bunch of entries if only "some" of them are actually
useful to the open-source stack.  Some more comments below.

> This patch improves media-sdk performance with up to 60%
> with the (admittedly synthetic) benchmarks we use in our nightly
> testing, without regressing any other benchmarks.
>
> Signed-off-by: David Weinehall <david.weinehall@linux.intel.com>
>
> diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c
> index 92e461c68385..5236a442a14f 100644
> --- a/drivers/gpu/drm/i915/intel_mocs.c
> +++ b/drivers/gpu/drm/i915/intel_mocs.c
> @@ -103,7 +103,6 @@ static const struct drm_i915_mocs_entry skylake_mocs_table[] = {
>  			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
>  			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
> -
>  	  /* 0x0010 */
>  	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
>  	},
> @@ -123,7 +122,133 @@ static const struct drm_i915_mocs_entry skylake_mocs_table[] = {
>  			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
>  	  /* 0x0030 */
> -	  .l3cc_value =   L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{

Apparently somebody started defining symbolic enums for the current MOCS
entries in i915_drm.h, which I guess makes this more obviously tied to
the kernel ABI.  I suppose it wouldn't hurt if you did the same.

> +	  /* 0x00000037 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000039 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000033 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},

What's the use of an entry with PTE-controlled target cache if you're
overriding the cacheability control to WB anyway?  Seems effectively
redundant with the current I915_MOCS_CACHED entry.  Same for the
0x33:0x10, 0x13:0x10 and 0x3:0x10 entries below.

> +	{
> +	  /* 0x00000017 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000037 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000039 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},

This entry is UC in all caches and seems redundant with the current
I915_MOCS_UNCACHED entry.  Why do you need it?

> +	{
> +	  /* 0x00000017 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x0000003b */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000033 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000019 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},

Another entry marked UC for all caches.

> +	{
> +	  /* 0x00000003 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x0000001b */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x0000001b */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000013 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
>  	},
>  };
>
> @@ -135,7 +260,6 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = {
>  			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
>  			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
> -
>  	  /* 0x0010 */
>  	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
>  	},
> @@ -145,7 +269,6 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = {
>  			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
>  			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
> -
>  	  /* 0x0030 */
>  	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
>  	},
> @@ -155,10 +278,36 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = {
>  			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
>  			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
> -
>  	  /* 0x0030 */
>  	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
>  	},
> +	{
> +	  /* 0x00000005 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000001 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000005 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
>  };
>
>  /**
> ---
>  drivers/gpu/drm/i915/intel_mocs.c | 159 ++++++++++++++++++++++++++++++++++++--
>  1 file changed, 154 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c
> index 92e461c68385..5236a442a14f 100644

Huh?  Duplicate patch follows?

> [...]

Arkadiusz Hiler April 27, 2017, 2:55 p.m. UTC | #2

On Wed, Apr 26, 2017 at 06:00:41PM +0300, David Weinehall wrote:
> Add a bunch of MOCS entries for gen 9 that were missing from intel_mocs.
> Some of these are used by media-sdk; if these entries are missing
> the default will instead be to do everything uncached.
> 
> This patch improves media-sdk performance with up to 60%
> with the (admittedly synthetic) benchmarks we use in our nightly
> testing, without regressing any other benchmarks.

Hey David,

I am testing some of the extended MOCS with Mesa and the differences I
see fit in the margins of statistical error.

Odd, I thought, so to make sure I haven't messed up anything in the
process of compiling, setting LD_LIBRARY_PATH and benchmarking I turned
everything to UNCACHED - and I saw severe performance drop.

So here is the question it induced:

Have you used the "closest neighbour" from entries available or did you
defaulted to the UNCACHED ones? That could be the culprit.

Note: I have tested MOCS for VB and Render Target only, and only in a
few synthetic cases - it will require much more fine-tuning and
benchmarking before any final conclusions.

David Weinehall April 27, 2017, 3:30 p.m. UTC | #3

On Thu, Apr 27, 2017 at 04:55:20PM +0200, Arkadiusz Hiler wrote:
> On Wed, Apr 26, 2017 at 06:00:41PM +0300, David Weinehall wrote:
> > Add a bunch of MOCS entries for gen 9 that were missing from intel_mocs.
> > Some of these are used by media-sdk; if these entries are missing
> > the default will instead be to do everything uncached.
> > 
> > This patch improves media-sdk performance with up to 60%
> > with the (admittedly synthetic) benchmarks we use in our nightly
> > testing, without regressing any other benchmarks.
> 
> Hey David,
> 
> I am testing some of the extended MOCS with Mesa and the differences I
> see fit in the margins of statistical error.
> 
> Odd, I thought, so to make sure I haven't messed up anything in the
> process of compiling, setting LD_LIBRARY_PATH and benchmarking I turned
> everything to UNCACHED - and I saw severe performance drop.
> 
> So here is the question it induced:
> 
> Have you used the "closest neighbour" from entries available or did you
> defaulted to the UNCACHED ones? That could be the culprit.
> 
> Note: I have tested MOCS for VB and Render Target only, and only in a
> few synthetic cases - it will require much more fine-tuning and
> benchmarking before any final conclusions.

As I mentioned in the commit message, the improvements only manifest
themselves for media-sdk workloads (and presumably other workloads
that uses the same hardware); if you see any performance regressions
with these additional entries I'd be interested to know.


Kind regards, David Weinehall

Chris Wilson April 27, 2017, 4:23 p.m. UTC | #4

On Thu, Apr 27, 2017 at 06:30:42PM +0300, David Weinehall wrote:
> On Thu, Apr 27, 2017 at 04:55:20PM +0200, Arkadiusz Hiler wrote:
> > On Wed, Apr 26, 2017 at 06:00:41PM +0300, David Weinehall wrote:
> > > Add a bunch of MOCS entries for gen 9 that were missing from intel_mocs.
> > > Some of these are used by media-sdk; if these entries are missing
> > > the default will instead be to do everything uncached.
> > > 
> > > This patch improves media-sdk performance with up to 60%
> > > with the (admittedly synthetic) benchmarks we use in our nightly
> > > testing, without regressing any other benchmarks.
> > 
> > Hey David,
> > 
> > I am testing some of the extended MOCS with Mesa and the differences I
> > see fit in the margins of statistical error.
> > 
> > Odd, I thought, so to make sure I haven't messed up anything in the
> > process of compiling, setting LD_LIBRARY_PATH and benchmarking I turned
> > everything to UNCACHED - and I saw severe performance drop.
> > 
> > So here is the question it induced:
> > 
> > Have you used the "closest neighbour" from entries available or did you
> > defaulted to the UNCACHED ones? That could be the culprit.
> > 
> > Note: I have tested MOCS for VB and Render Target only, and only in a
> > few synthetic cases - it will require much more fine-tuning and
> > benchmarking before any final conclusions.
> 
> As I mentioned in the commit message, the improvements only manifest
> themselves for media-sdk workloads (and presumably other workloads
> that uses the same hardware); if you see any performance regressions
> with these additional entries I'd be interested to know.

But what is being counter suggested is that their is no reason for these
mocs entries. If the sdk is just using mocs registers without first
programming them outside of the kernel abi, then it will be hitting
uncached memory - and then the only benefit is from simply enabling
cached access. The kernel ABI is minimalist for a reason, and we want to
know why we should be adding tables that we need to maintain forever
(bonus points for making that a consistent interface for hardware for
years to come).
-Chris

Arkadiusz Hiler May 4, 2017, 8:35 a.m. UTC | #5

On Thu, Apr 27, 2017 at 05:23:16PM +0100, Chris Wilson wrote:
> On Thu, Apr 27, 2017 at 06:30:42PM +0300, David Weinehall wrote:
> > On Thu, Apr 27, 2017 at 04:55:20PM +0200, Arkadiusz Hiler wrote:
> > > On Wed, Apr 26, 2017 at 06:00:41PM +0300, David Weinehall wrote:
> > > > Add a bunch of MOCS entries for gen 9 that were missing from intel_mocs.
> > > > Some of these are used by media-sdk; if these entries are missing
> > > > the default will instead be to do everything uncached.
> > > > 
> > > > This patch improves media-sdk performance with up to 60%
> > > > with the (admittedly synthetic) benchmarks we use in our nightly
> > > > testing, without regressing any other benchmarks.
> > > 
> > > Hey David,
> > > 
> > > I am testing some of the extended MOCS with Mesa and the differences I
> > > see fit in the margins of statistical error.
> > > 
> > > Odd, I thought, so to make sure I haven't messed up anything in the
> > > process of compiling, setting LD_LIBRARY_PATH and benchmarking I turned
> > > everything to UNCACHED - and I saw severe performance drop.
> > > 
> > > So here is the question it induced:
> > > 
> > > Have you used the "closest neighbour" from entries available or did you
> > > defaulted to the UNCACHED ones? That could be the culprit.
> > > 
> > > Note: I have tested MOCS for VB and Render Target only, and only in a
> > > few synthetic cases - it will require much more fine-tuning and
> > > benchmarking before any final conclusions.
> > 
> > As I mentioned in the commit message, the improvements only manifest
> > themselves for media-sdk workloads (and presumably other workloads
> > that uses the same hardware); if you see any performance regressions
> > with these additional entries I'd be interested to know.
> 
> But what is being counter suggested is that their is no reason for these
> mocs entries. If the sdk is just using mocs registers without first
> programming them outside of the kernel abi, then it will be hitting
> uncached memory - and then the only benefit is from simply enabling
> cached access. The kernel ABI is minimalist for a reason, and we want to
> know why we should be adding tables that we need to maintain forever
> (bonus points for making that a consistent interface for hardware for
> years to come).
> -Chris

Thanks for rephrasing - that's exactly what I am concerned with.

Did you just use the MediaSDK as it is - meaning that MOCS entries
beyond the set of the 3 we have defined had been naively utilized?

If that's the case it is probably the cause of the performance
difference - everything beyond "the 3" means UNCACHED.

Can you try changing MediaSDK to only use entries that are already in?
How the performance differs in that case?

Tvrtko Ursulin May 4, 2017, 8:53 a.m. UTC | #6

On 04/05/2017 09:35, Arkadiusz Hiler wrote:
> On Thu, Apr 27, 2017 at 05:23:16PM +0100, Chris Wilson wrote:
>> On Thu, Apr 27, 2017 at 06:30:42PM +0300, David Weinehall wrote:
>>> On Thu, Apr 27, 2017 at 04:55:20PM +0200, Arkadiusz Hiler wrote:
>>>> On Wed, Apr 26, 2017 at 06:00:41PM +0300, David Weinehall wrote:
>>>>> Add a bunch of MOCS entries for gen 9 that were missing from intel_mocs.
>>>>> Some of these are used by media-sdk; if these entries are missing
>>>>> the default will instead be to do everything uncached.
>>>>>
>>>>> This patch improves media-sdk performance with up to 60%
>>>>> with the (admittedly synthetic) benchmarks we use in our nightly
>>>>> testing, without regressing any other benchmarks.
>>>>
>>>> Hey David,
>>>>
>>>> I am testing some of the extended MOCS with Mesa and the differences I
>>>> see fit in the margins of statistical error.
>>>>
>>>> Odd, I thought, so to make sure I haven't messed up anything in the
>>>> process of compiling, setting LD_LIBRARY_PATH and benchmarking I turned
>>>> everything to UNCACHED - and I saw severe performance drop.
>>>>
>>>> So here is the question it induced:
>>>>
>>>> Have you used the "closest neighbour" from entries available or did you
>>>> defaulted to the UNCACHED ones? That could be the culprit.
>>>>
>>>> Note: I have tested MOCS for VB and Render Target only, and only in a
>>>> few synthetic cases - it will require much more fine-tuning and
>>>> benchmarking before any final conclusions.
>>>
>>> As I mentioned in the commit message, the improvements only manifest
>>> themselves for media-sdk workloads (and presumably other workloads
>>> that uses the same hardware); if you see any performance regressions
>>> with these additional entries I'd be interested to know.
>>
>> But what is being counter suggested is that their is no reason for these
>> mocs entries. If the sdk is just using mocs registers without first
>> programming them outside of the kernel abi, then it will be hitting
>> uncached memory - and then the only benefit is from simply enabling
>> cached access. The kernel ABI is minimalist for a reason, and we want to
>> know why we should be adding tables that we need to maintain forever
>> (bonus points for making that a consistent interface for hardware for
>> years to come).
>> -Chris
>
> Thanks for rephrasing - that's exactly what I am concerned with.
>
> Did you just use the MediaSDK as it is - meaning that MOCS entries
> beyond the set of the 3 we have defined had been naively utilized?
>
> If that's the case it is probably the cause of the performance
> difference - everything beyond "the 3" means UNCACHED.
>
> Can you try changing MediaSDK to only use entries that are already in?
> How the performance differs in that case?

Alternatively, at the time this was on my plate, Eero had suggested a 
sequence of experiments by basically gradually replicating the default 
UC/WB entries to currently empty slots, starting on GT2 parts and then 
going forward adding the more fine tuned parts.

This would have showed the benefit of fine tuned entries vs basic cached 
ones. Unfortunately I never got round doing this, but it sounded like a 
really good approach to me.

I could paste these suggestion here if Eero wouldn't mind? But I am also 
not sure if it is still relevant after the effort of exactly documenting 
the extended set of entries started.

Regards,

Tvrtko

Eero Tamminen May 4, 2017, 9:21 a.m. UTC | #7

Hi,

On 04.05.2017 11:53, Tvrtko Ursulin wrote:
> On 04/05/2017 09:35, Arkadiusz Hiler wrote:
>> On Thu, Apr 27, 2017 at 05:23:16PM +0100, Chris Wilson wrote:
>>> But what is being counter suggested is that their is no reason for these
>>> mocs entries. If the sdk is just using mocs registers without first
>>> programming them outside of the kernel abi, then it will be hitting
>>> uncached memory - and then the only benefit is from simply enabling
>>> cached access. The kernel ABI is minimalist for a reason, and we want to
>>> know why we should be adding tables that we need to maintain forever
>>> (bonus points for making that a consistent interface for hardware for
>>> years to come).
>>> -Chris
>>
>> Thanks for rephrasing - that's exactly what I am concerned with.
>>
>> Did you just use the MediaSDK as it is - meaning that MOCS entries
>> beyond the set of the 3 we have defined had been naively utilized?
>>
>> If that's the case it is probably the cause of the performance
>> difference - everything beyond "the 3" means UNCACHED.
>>
>> Can you try changing MediaSDK to only use entries that are already in?
>> How the performance differs in that case?
>
> Alternatively, at the time this was on my plate, Eero had suggested a
> sequence of experiments by basically gradually replicating the default
> UC/WB entries to currently empty slots, starting on GT2 parts and then
> going forward adding the more fine tuned parts.
>
> This would have showed the benefit of fine tuned entries vs basic cached
> ones. Unfortunately I never got round doing this, but it sounded like a
> really good approach to me.
>
> I could paste these suggestion here if Eero wouldn't mind?

Of course I don't mind. :-)


> But I am also
> not sure if it is still relevant after the effort of exactly documenting
> the extended set of entries started.

It's relevant in the sense that we don't currently don't know whether 
there's any actual benefit from the new entries (i.e. was it just an 
issue of VPG not using the correct existing entries).

If there is, that would be motivation to investigate impact of them also 
on other workloads.



	- Eero

Tvrtko Ursulin May 4, 2017, 9:34 a.m. UTC | #8

On 04/05/2017 10:21, Eero Tamminen wrote:
> Hi,
>
> On 04.05.2017 11:53, Tvrtko Ursulin wrote:
>> On 04/05/2017 09:35, Arkadiusz Hiler wrote:
>>> On Thu, Apr 27, 2017 at 05:23:16PM +0100, Chris Wilson wrote:
>>>> But what is being counter suggested is that their is no reason for
>>>> these
>>>> mocs entries. If the sdk is just using mocs registers without first
>>>> programming them outside of the kernel abi, then it will be hitting
>>>> uncached memory - and then the only benefit is from simply enabling
>>>> cached access. The kernel ABI is minimalist for a reason, and we
>>>> want to
>>>> know why we should be adding tables that we need to maintain forever
>>>> (bonus points for making that a consistent interface for hardware for
>>>> years to come).
>>>> -Chris
>>>
>>> Thanks for rephrasing - that's exactly what I am concerned with.
>>>
>>> Did you just use the MediaSDK as it is - meaning that MOCS entries
>>> beyond the set of the 3 we have defined had been naively utilized?
>>>
>>> If that's the case it is probably the cause of the performance
>>> difference - everything beyond "the 3" means UNCACHED.
>>>
>>> Can you try changing MediaSDK to only use entries that are already in?
>>> How the performance differs in that case?
>>
>> Alternatively, at the time this was on my plate, Eero had suggested a
>> sequence of experiments by basically gradually replicating the default
>> UC/WB entries to currently empty slots, starting on GT2 parts and then
>> going forward adding the more fine tuned parts.
>>
>> This would have showed the benefit of fine tuned entries vs basic cached
>> ones. Unfortunately I never got round doing this, but it sounded like a
>> really good approach to me.
>>
>> I could paste these suggestion here if Eero wouldn't mind?
>
> Of course I don't mind. :-)

Excellent, so here is what you wrote to me at that time:

------------------------------------------------------------------
You could start by putting first ED_UC line values to other ED_UC lines, 
and the first ED_WB line values to other ED_WB lines.

Then test that against standard kernel and VPG kernel on SKL GT2 
machine, to evaluate LLC settings.

If perf of that looks good, then test same settings also on SKL GT3e, or 
GT4e to evaluate impact of the more fine-tuned eLLC settings in addition 
to LLC ones.

If GT2 results don't look good, try using ED_WB line for all lines that 
have either ED_WB or L3_WB.

If if that doesn't look good either, try using ED_UC line for all lines 
that have either ED_UC or L3_UC.

And if even that fails to produce performance-wise good results, we can 
conclude that we need VPG kernel's fine-tuned MOCS settings are really 
needed.

Please provide some spreadsheet of the results you get.

(My guess is that that the first settings provide almost all of the 
available speedup on GT2, but with eDRAM things aren't that 
straightforward.)
------------------------------------------------------------------

>
>> But I am also
>> not sure if it is still relevant after the effort of exactly documenting
>> the extended set of entries started.
>
> It's relevant in the sense that we don't currently don't know whether
> there's any actual benefit from the new entries (i.e. was it just an
> issue of VPG not using the correct existing entries).
>
> If there is, that would be motivation to investigate impact of them also
> on other workloads.

There probably is a benefit since it is hard to imagine fine tuned 
entries would otherwise exist. But I agree it makes sense to get a 
complete understanding of relative contribution of individual fine tunings.

Regards,

Tvrtko

David Weinehall May 4, 2017, 2:47 p.m. UTC | #9

On Thu, May 04, 2017 at 10:35:33AM +0200, Arkadiusz Hiler wrote:
> On Thu, Apr 27, 2017 at 05:23:16PM +0100, Chris Wilson wrote:
> > On Thu, Apr 27, 2017 at 06:30:42PM +0300, David Weinehall wrote:
> > > On Thu, Apr 27, 2017 at 04:55:20PM +0200, Arkadiusz Hiler wrote:
> > > > On Wed, Apr 26, 2017 at 06:00:41PM +0300, David Weinehall wrote:
> > > > > Add a bunch of MOCS entries for gen 9 that were missing from intel_mocs.
> > > > > Some of these are used by media-sdk; if these entries are missing
> > > > > the default will instead be to do everything uncached.
> > > > > 
> > > > > This patch improves media-sdk performance with up to 60%
> > > > > with the (admittedly synthetic) benchmarks we use in our nightly
> > > > > testing, without regressing any other benchmarks.
> > > > 
> > > > Hey David,
> > > > 
> > > > I am testing some of the extended MOCS with Mesa and the differences I
> > > > see fit in the margins of statistical error.
> > > > 
> > > > Odd, I thought, so to make sure I haven't messed up anything in the
> > > > process of compiling, setting LD_LIBRARY_PATH and benchmarking I turned
> > > > everything to UNCACHED - and I saw severe performance drop.
> > > > 
> > > > So here is the question it induced:
> > > > 
> > > > Have you used the "closest neighbour" from entries available or did you
> > > > defaulted to the UNCACHED ones? That could be the culprit.
> > > > 
> > > > Note: I have tested MOCS for VB and Render Target only, and only in a
> > > > few synthetic cases - it will require much more fine-tuning and
> > > > benchmarking before any final conclusions.
> > > 
> > > As I mentioned in the commit message, the improvements only manifest
> > > themselves for media-sdk workloads (and presumably other workloads
> > > that uses the same hardware); if you see any performance regressions
> > > with these additional entries I'd be interested to know.
> > 
> > But what is being counter suggested is that their is no reason for these
> > mocs entries. If the sdk is just using mocs registers without first
> > programming them outside of the kernel abi, then it will be hitting
> > uncached memory - and then the only benefit is from simply enabling
> > cached access. The kernel ABI is minimalist for a reason, and we want to
> > know why we should be adding tables that we need to maintain forever
> > (bonus points for making that a consistent interface for hardware for
> > years to come).
> > -Chris
> 
> Thanks for rephrasing - that's exactly what I am concerned with.
> 
> Did you just use the MediaSDK as it is - meaning that MOCS entries
> beyond the set of the 3 we have defined had been naively utilized?
> 
> If that's the case it is probably the cause of the performance
> difference - everything beyond "the 3" means UNCACHED.
> 
> Can you try changing MediaSDK to only use entries that are already in?
> How the performance differs in that case?

We're benchmarking using upstream MediaSDK without changes, since that's
the only thing that's relevant. Customising benchmarks to get better
results isn't really an acceptable solution :)

Obviously fixing MediaSDK upstream is a different story, in case one of
the three pre-defined entries we have turns out to be the best possible
MOCS-settings for that workload.


Kind regards, David

Kenneth Graunke May 4, 2017, 4:51 p.m. UTC | #10

On Thursday, May 4, 2017 7:47:21 AM PDT David Weinehall wrote:
> On Thu, May 04, 2017 at 10:35:33AM +0200, Arkadiusz Hiler wrote:
> > Thanks for rephrasing - that's exactly what I am concerned with.
> > 
> > Did you just use the MediaSDK as it is - meaning that MOCS entries
> > beyond the set of the 3 we have defined had been naively utilized?
> > 
> > If that's the case it is probably the cause of the performance
> > difference - everything beyond "the 3" means UNCACHED.
> > 
> > Can you try changing MediaSDK to only use entries that are already in?
> > How the performance differs in that case?
> 
> We're benchmarking using upstream MediaSDK without changes, since that's
> the only thing that's relevant. Customising benchmarks to get better
> results isn't really an acceptable solution :)
> 
> Obviously fixing MediaSDK upstream is a different story, in case one of
> the three pre-defined entries we have turns out to be the best possible
> MOCS-settings for that workload.

You're right about customizing benchmarks, but...

MediaSDK is not a benchmark.  If I'm not mistaken, it's a userspace
driver produced by Intel engineers, one which Intel has the full
capability to change.  What you're saying is that Intel's MediaSDK
engineers are unwilling to change their software to provide better
performance for their Linux users.

That's pretty mental.

We don't warp the core operating system to work around userspace
software simply because they don't want to change it.

This isn't about open vs. closed or internal vs. public projects,
either.  I work on a public userspace driver for Intel graphics.
If I sent a kernel patch, the kernel developers would ask me the
exact same questions, to justify my new additions:

   1.  Is your userspace actually using all these new additions?
       If not, which ones are you using?

       They would ask me to drop anything I wasn't actually using
       yet, because speculatively adding things to the kernel that
       we have to maintain backwards compatibility for has caused
       both kernel and userspace developers a lot of trouble.

   2.  Are you sure that you need them all?  Is there a simpler
       solution - are some existing things good enough?  What's
       the additional benefit of each new addition?

I would have to answer these questions to the satisfaction of the
kernel developers before they would even consider taking my patch.

You keep pointing to your large performance improvement, but all it's
shown is that actually using the GPU cache is faster than having a
broken userspace driver explicitly set everything to uncached.  Many
people have pointed this out.

Arek and Tvrtko have good suggestions.  I don't think you're going
to get anywhere with this until you demonstrate that the new MOCS
entries provide some non-zero value over using the existing WB entry.

Here are a couple more data points:

1. We likely can't implement the documented "MOCS Version 1"
   table as is.

   The kernel exposes existing entries with specific semantics.
   Changing their meaning would introduce a backwards-incompatible
   change that would likely regress the performance of existing
   userspace.  This is almost certainly unacceptable - our customers,
   distro partners, users, and even people like Linus Torvalds will
   suffer and complain loudly.

   We could add the new entries at an offset - i.e. leave the existing
   3 entries, and append the rest after that.  But that would require
   changing userspace that assumes the Windows tables, such as MediaSDK
   (they would have to add 3 to their MOCS indexes).  At which point,
   we're changing them, so...the "runs unaltered" argument falls over.

2. The docs finally contain "recommended MOCS settings" - i.e. where
   to cache various types of objects, and at what age.  However, I
   believe those recommendations can be implemented with 1-2 new table
   entries and a PTE change to be eLLC-only by default.  Most of the
   table is completely unnecessary to implement the recommendations.

   I personally would like to try implementing their recommended
   settings in my driver.  I have not had time yet, but plan to try.

I'm very glad to see the Windows MOCS recommendations documented.
I'd been asking for that information for literally years.  If we'd
gotten it earlier, a lot of mess could have been avoided.  For future
platforms, we may want to coordinate and use the same table.  But
Gen9 has been shipping for ages, and we don't have that luxury.

--Ken

Ewins, Jon May 4, 2017, 8:32 p.m. UTC | #11

On 5/4/2017 9:51 AM, Kenneth Graunke wrote:
> On Thursday, May 4, 2017 7:47:21 AM PDT David Weinehall wrote:
>> On Thu, May 04, 2017 at 10:35:33AM +0200, Arkadiusz Hiler wrote:
>>> Thanks for rephrasing - that's exactly what I am concerned with.
>>>
>>> Did you just use the MediaSDK as it is - meaning that MOCS entries
>>> beyond the set of the 3 we have defined had been naively utilized?
>>>
>>> If that's the case it is probably the cause of the performance
>>> difference - everything beyond "the 3" means UNCACHED.
>>>
>>> Can you try changing MediaSDK to only use entries that are already in?
>>> How the performance differs in that case?
>> We're benchmarking using upstream MediaSDK without changes, since that's
>> the only thing that's relevant. Customising benchmarks to get better
>> results isn't really an acceptable solution :)
>>
>> Obviously fixing MediaSDK upstream is a different story, in case one of
>> the three pre-defined entries we have turns out to be the best possible
>> MOCS-settings for that workload.
> You're right about customizing benchmarks, but...
>
> MediaSDK is not a benchmark.  If I'm not mistaken, it's a userspace
> driver produced by Intel engineers, one which Intel has the full
> capability to change.  What you're saying is that Intel's MediaSDK
> engineers are unwilling to change their software to provide better
> performance for their Linux users.
>
> That's pretty mental.
>
> We don't warp the core operating system to work around userspace
> software simply because they don't want to change it.
Agreed, that isn't the intention.
> This isn't about open vs. closed or internal vs. public projects,
> either.  I work on a public userspace driver for Intel graphics.
> If I sent a kernel patch, the kernel developers would ask me the
> exact same questions, to justify my new additions:
>
>     1.  Is your userspace actually using all these new additions?
>         If not, which ones are you using?
>
>         They would ask me to drop anything I wasn't actually using
>         yet, because speculatively adding things to the kernel that
>         we have to maintain backwards compatibility for has caused
>         both kernel and userspace developers a lot of trouble.
>
>     2.  Are you sure that you need them all?  Is there a simpler
>         solution - are some existing things good enough?  What's
>         the additional benefit of each new addition?
>
> I would have to answer these questions to the satisfaction of the
> kernel developers before they would even consider taking my patch.
>
> You keep pointing to your large performance improvement, but all it's
> shown is that actually using the GPU cache is faster than having a
> broken userspace driver explicitly set everything to uncached.  Many
> people have pointed this out.
> Arek and Tvrtko have good suggestions.  I don't think you're going
> to get anywhere with this until you demonstrate that the new MOCS
> entries provide some non-zero value over using the existing WB entries
> Here are a couple more data points:
>
> 1. We likely can't implement the documented "MOCS Version 1"
>     table as is.
>
>     The kernel exposes existing entries with specific semantics.
>     Changing their meaning would introduce a backwards-incompatible
>     change that would likely regress the performance of existing
>     userspace.  This is almost certainly unacceptable - our customers,
>     distro partners, users, and even people like Linus Torvalds will
>     suffer and complain loudly.
>     
The way the existing entries are exposed should not be changing.
>     We could add the new entries at an offset - i.e. leave the existing
>     3 entries, and append the rest after that.  But that would require
>     changing userspace that assumes the Windows tables, such as MediaSDK
>     (they would have to add 3 to their MOCS indexes).  At which point,
>     we're changing them, so...the "runs unaltered" argument falls over.
The intention is certainly that the existing 3 entries already in use by 
open source user mode stack are not changed.
This is additive to what is there.  The BSpec should be reflecting that 
- if not that is an error.  The Intel UMDs will change to support this.
> 2. The docs finally contain "recommended MOCS settings" - i.e. where
>     to cache various types of objects, and at what age.  However, I
>     believe those recommendations can be implemented with 1-2 new table
>     entries and a PTE change to be eLLC-only by default.  Most of the
>     table is completely unnecessary to implement the recommendations.
>
>     I personally would like to try implementing their recommended
>     settings in my driver.  I have not had time yet, but plan to try.
>
> I'm very glad to see the Windows MOCS recommendations documented.
> I'd been asking for that information for literally years.  If we'd
> gotten it earlier, a lot of mess could have been avoided.  For future
> platforms, we may want to coordinate and use the same table.  But
> Gen9 has been shipping for ages, and we don't have that luxury.
I would hope that is not the case and the change can be made for gen9, 
as it should have no impact on existing user space as is, while the 
addition of the new entries can provide benefit.

> --Ken
>
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Rogozhkin, Dmitry V May 5, 2017, 2:46 a.m. UTC | #12

On 5/4/2017 9:51 AM, Kenneth Graunke wrote:
> MediaSDK is not a benchmark.  If I'm not mistaken, it's a userspace
> driver produced by Intel engineers, one which Intel has the full
> capability to change.  What you're saying is that Intel's MediaSDK
> engineers are unwilling to change their software to provide better
> performance for their Linux users.
>
> That's pretty mental.
You are mistaken. Media SDK is not a driver. It is a user space library 
which talks to the user space driver. And Media SDK does not set _any_ 
caching policies you are discussing here. That's the driver who sets 
these policies. I don't want to go further here who supports this 
driver, Intel or not, but there are mediasdk engineers whom you blame to 
not willing to do something and who actually only indirectly are related 
to this topic. Please, if you mean driver, say a driver.

Arkadiusz Hiler May 5, 2017, 11:31 a.m. UTC | #13

On Thu, May 04, 2017 at 07:46:34PM -0700, Dmitry Rogozhkin wrote:
> 
> 
> On 5/4/2017 9:51 AM, Kenneth Graunke wrote:
> > MediaSDK is not a benchmark.  If I'm not mistaken, it's a userspace
> > driver produced by Intel engineers, one which Intel has the full
> > capability to change.  What you're saying is that Intel's MediaSDK
> > engineers are unwilling to change their software to provide better
> > performance for their Linux users.
> > 
> > That's pretty mental.
> You are mistaken. Media SDK is not a driver. It is a user space library
> which talks to the user space driver. And Media SDK does not set _any_
> caching policies you are discussing here. That's the driver who sets these
> policies. I don't want to go further here who supports this driver, Intel or
> not, but there are mediasdk engineers whom you blame to not willing to do
> something and who actually only indirectly are related to this topic.
> Please, if you mean driver, say a driver.

You are right. It is very unfortunate that those two were confused.

To further clarify, here's an excerpt from MediaSDK's README.md:

"Intel Media SDK depends on a special version of LibVA which comes with
Intel Media Server Studio installation and is not in upstream, so this
version is not compatible with the LibVA/driver available at 01org."

Nonetheless, the main question still stands:

How big is the performance gap when using appropriate, existing entries
entries vs the entries proposed here?

Also a couple more questions arise:

* What about versioning schema? We definitely need a consumer for that.

* The libva you use is a **closed** UMD. It this enough of a reason to
  do the change (sans the versioning)? It starts to get blurry here.

Don't get me wrong, I've seen the spec and believe that a lot of hard
work was put into determining the entries and the usage scenarios, and
that they can benefit us.

I even have some patches ready[0] and I would love to get them upstream,
but they have to be tested in reproducible manner, with clear
methodology. They need to be discussed and deemed useful by providing
real value. We cannot simply relay on opaque claims that they are good
for us.

To do the above I am fiddling with Mesa - if we will see performance
boost, then this would provide both a solid reason to add the entries
and a consumer for the versioning API.

As of proprietary libva/MediaSDK combo I see at least three options to
do "the 3" vs "extended mocs" testing:

1. change the libva you use to utilize only the basic 3 entries only and
   do the testing - if you can change the source

2. change kernel and fill in entries with copies of the 3 basic ones and
   do the testing - will work even without the source code of libva

3. port the MOCS changes to 01org's libva and either use this version
   from MediaSDK for testing or use some other benchmarks -  this could
   be the consumer we need

I hope that this will move us forward.

[0]: https://github.com/ivyl/linux/commits/mocs

Chris Wilson May 5, 2017, 11:49 a.m. UTC | #14

On Fri, May 05, 2017 at 01:31:37PM +0200, Arkadiusz Hiler wrote:
> On Thu, May 04, 2017 at 07:46:34PM -0700, Dmitry Rogozhkin wrote:
> > 
> > 
> > On 5/4/2017 9:51 AM, Kenneth Graunke wrote:
> > > MediaSDK is not a benchmark.  If I'm not mistaken, it's a userspace
> > > driver produced by Intel engineers, one which Intel has the full
> > > capability to change.  What you're saying is that Intel's MediaSDK
> > > engineers are unwilling to change their software to provide better
> > > performance for their Linux users.
> > > 
> > > That's pretty mental.
> > You are mistaken. Media SDK is not a driver. It is a user space library
> > which talks to the user space driver. And Media SDK does not set _any_
> > caching policies you are discussing here. That's the driver who sets these
> > policies. I don't want to go further here who supports this driver, Intel or
> > not, but there are mediasdk engineers whom you blame to not willing to do
> > something and who actually only indirectly are related to this topic.
> > Please, if you mean driver, say a driver.
> 
> You are right. It is very unfortunate that those two were confused.
> 
> To further clarify, here's an excerpt from MediaSDK's README.md:
> 
> "Intel Media SDK depends on a special version of LibVA which comes with
> Intel Media Server Studio installation and is not in upstream, so this
> version is not compatible with the LibVA/driver available at 01org."
> 
> 
> Nonetheless, the main question still stands:
> 
> How big is the performance gap when using appropriate, existing entries
> entries vs the entries proposed here?
> 
> 
> Also a couple more questions arise:
> 
> * What about versioning schema? We definitely need a consumer for that.
> 
> * The libva you use is a **closed** UMD. It this enough of a reason to
>   do the change (sans the versioning)? It starts to get blurry here.
> 
> Don't get me wrong, I've seen the spec and believe that a lot of hard
> work was put into determining the entries and the usage scenarios, and
> that they can benefit us.
> 
> I even have some patches ready[0] and I would love to get them upstream,
> but they have to be tested in reproducible manner, with clear
> methodology. They need to be discussed and deemed useful by providing
> real value. We cannot simply relay on opaque claims that they are good
> for us.
> 
> 
> To do the above I am fiddling with Mesa - if we will see performance
> boost, then this would provide both a solid reason to add the entries
> and a consumer for the versioning API.
> 
> 
> As of proprietary libva/MediaSDK combo I see at least three options to
> do "the 3" vs "extended mocs" testing:
> 
> 1. change the libva you use to utilize only the basic 3 entries only and
>    do the testing - if you can change the source
> 
> 2. change kernel and fill in entries with copies of the 3 basic ones and
>    do the testing - will work even without the source code of libva
> 
> 3. port the MOCS changes to 01org's libva and either use this version
>    from MediaSDK for testing or use some other benchmarks -  this could
>    be the consumer we need
> 
> I hope that this will move us forward.

Michal (W or W) has some patches to allow contexts to set their own mocs,
which is definitely my preferred solution.
-Chris

Arkadiusz Hiler May 5, 2017, 12:53 p.m. UTC | #15

On Fri, May 05, 2017 at 12:49:49PM +0100, Chris Wilson wrote:
> On Fri, May 05, 2017 at 01:31:37PM +0200, Arkadiusz Hiler wrote:
> > On Thu, May 04, 2017 at 07:46:34PM -0700, Dmitry Rogozhkin wrote:
> > > 
> > > 
> > > On 5/4/2017 9:51 AM, Kenneth Graunke wrote:
> > > > MediaSDK is not a benchmark.  If I'm not mistaken, it's a userspace
> > > > driver produced by Intel engineers, one which Intel has the full
> > > > capability to change.  What you're saying is that Intel's MediaSDK
> > > > engineers are unwilling to change their software to provide better
> > > > performance for their Linux users.
> > > > 
> > > > That's pretty mental.
> > > You are mistaken. Media SDK is not a driver. It is a user space library
> > > which talks to the user space driver. And Media SDK does not set _any_
> > > caching policies you are discussing here. That's the driver who sets these
> > > policies. I don't want to go further here who supports this driver, Intel or
> > > not, but there are mediasdk engineers whom you blame to not willing to do
> > > something and who actually only indirectly are related to this topic.
> > > Please, if you mean driver, say a driver.
> > 
> > You are right. It is very unfortunate that those two were confused.
> > 
> > To further clarify, here's an excerpt from MediaSDK's README.md:
> > 
> > "Intel Media SDK depends on a special version of LibVA which comes with
> > Intel Media Server Studio installation and is not in upstream, so this
> > version is not compatible with the LibVA/driver available at 01org."
> > 
> > 
> > Nonetheless, the main question still stands:
> > 
> > How big is the performance gap when using appropriate, existing entries
> > entries vs the entries proposed here?
> > 
> > 
> > Also a couple more questions arise:
> > 
> > * What about versioning schema? We definitely need a consumer for that.
> > 
> > * The libva you use is a **closed** UMD. It this enough of a reason to
> >   do the change (sans the versioning)? It starts to get blurry here.
> > 
> > Don't get me wrong, I've seen the spec and believe that a lot of hard
> > work was put into determining the entries and the usage scenarios, and
> > that they can benefit us.
> > 
> > I even have some patches ready[0] and I would love to get them upstream,
> > but they have to be tested in reproducible manner, with clear
> > methodology. They need to be discussed and deemed useful by providing
> > real value. We cannot simply relay on opaque claims that they are good
> > for us.
> > 
> > 
> > To do the above I am fiddling with Mesa - if we will see performance
> > boost, then this would provide both a solid reason to add the entries
> > and a consumer for the versioning API.
> > 
> > 
> > As of proprietary libva/MediaSDK combo I see at least three options to
> > do "the 3" vs "extended mocs" testing:
> > 
> > 1. change the libva you use to utilize only the basic 3 entries only and
> >    do the testing - if you can change the source
> > 
> > 2. change kernel and fill in entries with copies of the 3 basic ones and
> >    do the testing - will work even without the source code of libva
> > 
> > 3. port the MOCS changes to 01org's libva and either use this version
> >    from MediaSDK for testing or use some other benchmarks -  this could
> >    be the consumer we need
> > 
> > I hope that this will move us forward.
> 
> Michal (W or W) has some patches to allow contexts to set their own mocs,
> which is definitely my preferred solution.
> -Chris

Michal Winiarski (CCed).

IIRC the series was only for render - other rings does not have
context-stored MOCS.

Kenneth Graunke May 5, 2017, 3:44 p.m. UTC | #16

On Thursday, May 4, 2017 7:46:34 PM PDT Dmitry Rogozhkin wrote:
> 
> On 5/4/2017 9:51 AM, Kenneth Graunke wrote:
> > MediaSDK is not a benchmark.  If I'm not mistaken, it's a userspace
> > driver produced by Intel engineers, one which Intel has the full
> > capability to change.  What you're saying is that Intel's MediaSDK
> > engineers are unwilling to change their software to provide better
> > performance for their Linux users.
> >
> > That's pretty mental.
> You are mistaken. Media SDK is not a driver. It is a user space library 
> which talks to the user space driver. And Media SDK does not set _any_ 
> caching policies you are discussing here. That's the driver who sets 
> these policies. I don't want to go further here who supports this 
> driver, Intel or not, but there are mediasdk engineers whom you blame to 
> not willing to do something and who actually only indirectly are related 
> to this topic. Please, if you mean driver, say a driver.

Sorry, that's my mistake - and I think a number of other people in the
thread were similarly confused.  So, the suggestion isn't to change
MediaSDK at all - it's to change the closed-source libva driver that's
setting MOCS values that aren't supported by the upstream kernel.  IIRC
the upstream libva-intel-driver does not have this bug.

My point largely stands, when redirected - someone is developing a
broken closed source userspace driver and is apparently unwilling to
change it.  That's the real problem.

Rogozhkin, Dmitry V May 5, 2017, 4:21 p.m. UTC | #17

On 5/5/2017 8:44 AM, Kenneth Graunke wrote:
> On Thursday, May 4, 2017 7:46:34 PM PDT Dmitry Rogozhkin wrote:
>> On 5/4/2017 9:51 AM, Kenneth Graunke wrote:
>>> MediaSDK is not a benchmark.  If I'm not mistaken, it's a userspace
>>> driver produced by Intel engineers, one which Intel has the full
>>> capability to change.  What you're saying is that Intel's MediaSDK
>>> engineers are unwilling to change their software to provide better
>>> performance for their Linux users.
>>>
>>> That's pretty mental.
>> You are mistaken. Media SDK is not a driver. It is a user space library
>> which talks to the user space driver. And Media SDK does not set _any_
>> caching policies you are discussing here. That's the driver who sets
>> these policies. I don't want to go further here who supports this
>> driver, Intel or not, but there are mediasdk engineers whom you blame to
>> not willing to do something and who actually only indirectly are related
>> to this topic. Please, if you mean driver, say a driver.
> Sorry, that's my mistake - and I think a number of other people in the
> thread were similarly confused.  So, the suggestion isn't to change
> MediaSDK at all - it's to change the closed-source libva driver that's
> setting MOCS values that aren't supported by the upstream kernel.  IIRC
> the upstream libva-intel-driver does not have this bug.
Mistakes happen. I hope this is clarified now.
>
> My point largely stands, when redirected - someone is developing a
> broken closed source userspace driver and is apparently unwilling to
> change it.  That's the real problem.
Broken? Have you ever attempted to run functional and performance 
competitive between closed source and open source user space drivers? I 
attempted and a number of others have attempted that. Result is that 
open source driver has significantly worse encoding quality, worse to 
the degree that any performance comparisons start to make no sense. 
(Yep, work in progress to try to fix that, I know.) Decoding quality is 
on par, but I saw cases when performance is 5-10% worse (and that's when 
both drivers work on their presumably optimal settings). So, "broken" 
closed source driver is in use for the _reason_. And which driver could 
be considered "broken" after that: closed source one or open source one?

So, why you are addressing that closed source driver is broken? If it 
will be put in the environment with the upstream kernel, then it will 
eventually be broken and that's what we are trying to fix here. But do 
you realize that in the production environment where the driver is 
intended to work there is a patched kernel mode driver in place with the 
MOCS settings to satisfy the need of the driver? Or you naively think we 
use non-modified KMD from the upstream or previously released versions 
from kernel.org. Bad guess! In the production environment driver is not 
broken.

MY ADVICE to everyone on the thread: topic is really hot, no clear path 
to deal with the problem is seen. Please, try to avoid addressing work 
done by others in the words: "unwilling", "broken", etc. This bothers 
people... Be easy...

There were suggestions in the thread to apply existing settings to the 
closed source driver and see how cool things will be. Well, we applied 
this settings: not cache anything policy and performance is worse 
comparing to the closed source solution. We have 3 other policies to 
try. Which one you want to try? Or you want to try combination? For 
which objects you suggest to try these combinations? If I recall 
correctly closed source driver has dozens, maybe close to hundred 
objects of different kind with different caching policies. You want to 
try all combinations? That's out of reality per my opinion... At least 
that can't happen within few days.

So, closed source driver _has_ settings it considers optimal. And yes, 
there is such a question how really optimal they are, but existing 
settings work, work well for few years already (in a closed source 
solution delivered to the customers) and work better comparing to open 
source solution. This solution is far to be considered broken. Open 
source solution is delivered to absolutely different set of customers, 
customers do not intersect, and I guess we can't consider open source 
solution broken as well. It satisfy certain customers segment. And there 
are possibilities to improve both solutions. Both groups can be blamed 
in unwillingness, but... wait a minute... after all here are both groups 
in the thread discussing this topic, so are we willing or unwilling to 
change?

I personally think that optimal solution would be:
1) Either an API to permit drivers to program settings they consider 
optimal and avoid all this discussion altogether.
2) or remove this MOCS table from the SW level completely and hold it on 
GPU side hidden from the SW stack (yep, not possible for current HW)
As for the solution#1, I afraid we have technical issues, right? Is that 
true that only Render contexts can hold settings table and that's not 
possible for other context types due to HW limitations? At what cost we 
would be able to support use cases with few drivers working at the same 
time and each trying to update MOCS settings for the global contexts?

Dmitry.

Kenneth Graunke May 5, 2017, 7:36 p.m. UTC | #18

On Friday, May 5, 2017 9:21:54 AM PDT Dmitry Rogozhkin wrote:
> > My point largely stands, when redirected - someone is developing a
> > broken closed source userspace driver and is apparently unwilling to
> > change it.  That's the real problem.
> Broken? Have you ever attempted to run functional and performance 
> competitive between closed source and open source user space drivers? I 
> attempted and a number of others have attempted that. Result is that 
> open source driver has significantly worse encoding quality, worse to 
> the degree that any performance comparisons start to make no sense. 
> (Yep, work in progress to try to fix that, I know.) Decoding quality is 
> on par, but I saw cases when performance is 5-10% worse (and that's when 
> both drivers work on their presumably optimal settings). So, "broken" 
> closed source driver is in use for the _reason_. And which driver could 
> be considered "broken" after that: closed source one or open source one?

I'm not in any way arguing that one driver is superior to the other,
nor that anyone should do performance comparisons between the two.

> So, why you are addressing that closed source driver is broken? If it 
> will be put in the environment with the upstream kernel, then it will 
> eventually be broken and that's what we are trying to fix here. But do 
> you realize that in the production environment where the driver is 
> intended to work there is a patched kernel mode driver in place with the 
> MOCS settings to satisfy the need of the driver? Or you naively think we 
> use non-modified KMD from the upstream or previously released versions 
> from kernel.org. Bad guess! In the production environment driver is not 
> broken.

Yes, I'm aware that the closed source userspace relies on a patched
non-upstream kernel, and that it's being used in some environments.
Presumably it works just fine there.

Isn't the goal to make that userspace driver run on the upstream Linux
kernel, so people can use it in places other than the environment where
it currently exists?  Would it not make sense to have it run reasonably
well on upstream kernels that are currently shipping?

On released upstream kernels, there are only three MOCS entries: UC,
PTE, and WB.  Using any other entries is ill-advised (even broken),
not only because they currently correspond to UC (leading to poor
performance), but because they may change meaning in the future.

Future upstream kernels may add new entries, and presumably would
advertise that via a getparam (i.e. I915_PARAM_MOCS_TABLE_VERSION).

The suggestion is to make your userspace driver use entry 2 (WB)
for anything you want cached, when running on an upstream kernel
(but keep using the existing entries on the patched kernel).  That
would perform much better than it currently does.  You probably won't
get the full 60%, but perhaps you'll get 50%.

Then, add any additional entries you want to the kernel, advertise
it via a GETPARAM, and make your driver use those new entries when
they are supported.  Benchmark.  See how much faster your workload
is with the new entries (compared to WB-for-everything).  That's the
number everyone here wants to see.

This should be a trivial amount of work - nobody's asking for any
combinatorial explosion of testing.  Doing this exercise also means
that your software would perform better on currently shipping upstream
kernels (arguably improving the driver) - and once you have the full set
of entries, it will perform even better.

Does that make sense?

--Ken

Daniel Vetter May 8, 2017, 8:09 a.m. UTC | #19

On Wed, Apr 26, 2017 at 06:00:41PM +0300, David Weinehall wrote:
> Add a bunch of MOCS entries for gen 9 that were missing from intel_mocs.
> Some of these are used by media-sdk; if these entries are missing
> the default will instead be to do everything uncached.
> 
> This patch improves media-sdk performance with up to 60%
> with the (admittedly synthetic) benchmarks we use in our nightly
> testing, without regressing any other benchmarks.
> 
> Signed-off-by: David Weinehall <david.weinehall@linux.intel.com>

Hm I guess this discussion is sprawling a bit too much, so top-level
summary:
- this is kernel/userspace api
- which means we must have the corresponding userspace as open-source
  patches proposed to be merged into the corresponding upstream repos
  (libva-intel it seems here, since this is for media).

Without that this patch is 100% moot for upstream consideration.

I also assume that as soon as we'll have the userspace side open and
published for review, all the questions around GETPARAM and how to
allocate MOCS entries will be much easier to understand and make a
meaningful dicision. As-is, half of the picture is flat-out missing, any
decision we might is pretty much guaranteed to be bogus. Which
incidentally is why upstream wants both sides of the story as open-source.

Thanks, Daniel

> 
> diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c
> index 92e461c68385..5236a442a14f 100644
> --- a/drivers/gpu/drm/i915/intel_mocs.c
> +++ b/drivers/gpu/drm/i915/intel_mocs.c
> @@ -103,7 +103,6 @@ static const struct drm_i915_mocs_entry skylake_mocs_table[] = {
>  			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
>  			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
> -
>  	  /* 0x0010 */
>  	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
>  	},
> @@ -123,7 +122,133 @@ static const struct drm_i915_mocs_entry skylake_mocs_table[] = {
>  			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
>  	  /* 0x0030 */
> -	  .l3cc_value =   L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000037 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000039 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000033 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000017 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000037 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000039 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000017 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x0000003b */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000033 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000019 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000003 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x0000001b */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x0000001b */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000013 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
>  	},
>  };
> 
> @@ -135,7 +260,6 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = {
>  			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
>  			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
> -
>  	  /* 0x0010 */
>  	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
>  	},
> @@ -145,7 +269,6 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = {
>  			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
>  			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
> -
>  	  /* 0x0030 */
>  	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
>  	},
> @@ -155,10 +278,36 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = {
>  			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
>  			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
> -
>  	  /* 0x0030 */
>  	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
>  	},
> +	{
> +	  /* 0x00000005 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000001 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000005 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
>  };
> 
>  /**
> ---
>  drivers/gpu/drm/i915/intel_mocs.c | 159 ++++++++++++++++++++++++++++++++++++--
>  1 file changed, 154 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_mocs.c b/drivers/gpu/drm/i915/intel_mocs.c
> index 92e461c68385..5236a442a14f 100644
> --- a/drivers/gpu/drm/i915/intel_mocs.c
> +++ b/drivers/gpu/drm/i915/intel_mocs.c
> @@ -103,7 +103,6 @@ static const struct drm_i915_mocs_entry skylake_mocs_table[] = {
>  			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
>  			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
> -
>  	  /* 0x0010 */
>  	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
>  	},
> @@ -123,7 +122,133 @@ static const struct drm_i915_mocs_entry skylake_mocs_table[] = {
>  			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
>  	  /* 0x0030 */
> -	  .l3cc_value =   L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000037 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000039 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000033 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000017 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000037 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000039 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000017 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x0000003b */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000033 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000019 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000003 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x0000001b */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x0000001b */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000013 */
> +	  .control_value = LE_CACHEABILITY(LE_WB) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(1) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
>  	},
>  };
>  
> @@ -135,7 +260,6 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = {
>  			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
>  			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
> -
>  	  /* 0x0010 */
>  	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
>  	},
> @@ -145,7 +269,6 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = {
>  			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
>  			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
> -
>  	  /* 0x0030 */
>  	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
>  	},
> @@ -155,10 +278,36 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = {
>  			   LE_TGT_CACHE(LE_TC_LLC_ELLC) |
>  			   LE_LRUM(3) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
>  			   LE_PFM(0) | LE_SCF(0),
> -
>  	  /* 0x0030 */
>  	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
>  	},
> +	{
> +	  /* 0x00000005 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0030 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_WB),
> +	},
> +	{
> +	  /* 0x00000001 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_PAGETABLE) |
> +			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
> +	{
> +	  /* 0x00000005 */
> +	  .control_value = LE_CACHEABILITY(LE_UC) |
> +			   LE_TGT_CACHE(LE_TC_LLC) |
> +			   LE_LRUM(0) | LE_AOM(0) | LE_RSC(0) | LE_SCC(0) |
> +			   LE_PFM(0) | LE_SCF(0),
> +	  /* 0x0010 */
> +	  .l3cc_value =    L3_ESC(0) | L3_SCC(0) | L3_CACHEABILITY(L3_UC),
> +	},
>  };
>  
>  /**
> -- 
> 2.11.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

drm/i915: Update MOCS settings for gen 9

Commit Message

Comments

Patch