diff mbox series

[RFC,1/2] drm/dp: Make number of AUX retries configurable by display drivers.

Message ID 20210210083338.100068-1-khaled.almahallawy@intel.com (mailing list archive)
State New, archived
Headers show
Series [RFC,1/2] drm/dp: Make number of AUX retries configurable by display drivers. | expand

Commit Message

Almahallawy, Khaled Feb. 10, 2021, 8:33 a.m. UTC
The number of AUX retries specified in the DP specs is 7. Currently, to make Dell 4k monitors happier, the number of retries are 32.
i915 also retries 5 times (intel_dp_aux_xfer) which means in the case of AUX timeout we actually retries 32 * 5 = 160 times.

So making the number of aux retires a variable to allow for fine tuning and optimization of aux timing.

Signed-off-by: Khaled Almahallawy <khaled.almahallawy@intel.com>
---
 drivers/gpu/drm/drm_dp_helper.c | 10 +++-------
 include/drm/drm_dp_helper.h     |  1 +
 2 files changed, 4 insertions(+), 7 deletions(-)

Comments

Thomas Zimmermann Feb. 10, 2021, 8:55 a.m. UTC | #1
Hi

Am 10.02.21 um 09:33 schrieb Khaled Almahallawy:
> The number of AUX retries specified in the DP specs is 7. Currently, to make Dell 4k monitors happier, the number of retries are 32.
> i915 also retries 5 times (intel_dp_aux_xfer) which means in the case of AUX timeout we actually retries 32 * 5 = 160 times.
> 
> So making the number of aux retires a variable to allow for fine tuning and optimization of aux timing.
> 
> Signed-off-by: Khaled Almahallawy <khaled.almahallawy@intel.com>
> ---
>   drivers/gpu/drm/drm_dp_helper.c | 10 +++-------
>   include/drm/drm_dp_helper.h     |  1 +
>   2 files changed, 4 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_dp_helper.c b/drivers/gpu/drm/drm_dp_helper.c
> index eedbb48815b7..8fdf57b4a06c 100644
> --- a/drivers/gpu/drm/drm_dp_helper.c
> +++ b/drivers/gpu/drm/drm_dp_helper.c
> @@ -249,13 +249,7 @@ static int drm_dp_dpcd_access(struct drm_dp_aux *aux, u8 request,
>   
>   	mutex_lock(&aux->hw_mutex);
>   
> -	/*
> -	 * The specification doesn't give any recommendation on how often to
> -	 * retry native transactions. We used to retry 7 times like for
> -	 * aux i2c transactions but real world devices this wasn't
> -	 * sufficient, bump to 32 which makes Dell 4k monitors happier.
> -	 */
> -	for (retry = 0; retry < 32; retry++) {
> +	for (retry = 0; retry < aux->num_retries; retry++) {
>   		if (ret != 0 && ret != -ETIMEDOUT) {
>   			usleep_range(AUX_RETRY_INTERVAL,
>   				     AUX_RETRY_INTERVAL + 100);
> @@ -1744,6 +1738,8 @@ void drm_dp_aux_init(struct drm_dp_aux *aux)
>   	aux->ddc.retries = 3;
>   
>   	aux->ddc.lock_ops = &drm_dp_i2c_lock_ops;
> +	/*Still making the Dell 4k monitors happier*/

The original comment was helpful; this one isn't.

Besides that, what problem does this patchset address? Too much probing? 
If I connect a Dell monitor to an Intel card, how often does it have to 
probe?

Best regards
Thomas

> +	aux->num_retries = 32;
>   }
>   EXPORT_SYMBOL(drm_dp_aux_init);
>   
> diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h
> index edffd1dcca3e..16cbfc8f5e66 100644
> --- a/include/drm/drm_dp_helper.h
> +++ b/include/drm/drm_dp_helper.h
> @@ -1876,6 +1876,7 @@ struct drm_dp_aux {
>   	struct mutex hw_mutex;
>   	struct work_struct crc_work;
>   	u8 crc_count;
> +	int num_retries;
>   	ssize_t (*transfer)(struct drm_dp_aux *aux,
>   			    struct drm_dp_aux_msg *msg);
>   	/**
>
Almahallawy, Khaled Feb. 10, 2021, 10:16 a.m. UTC | #2
On Wed, 2021-02-10 at 09:55 +0100, Thomas Zimmermann wrote:
> Hi
> 
> Am 10.02.21 um 09:33 schrieb Khaled Almahallawy:
> > The number of AUX retries specified in the DP specs is 7.
> > Currently, to make Dell 4k monitors happier, the number of retries
> > are 32.
> > i915 also retries 5 times (intel_dp_aux_xfer) which means in the
> > case of AUX timeout we actually retries 32 * 5 = 160 times.
> > 
> > So making the number of aux retires a variable to allow for fine
> > tuning and optimization of aux timing.
> > 
> > Signed-off-by: Khaled Almahallawy <khaled.almahallawy@intel.com>
> > ---
> >   drivers/gpu/drm/drm_dp_helper.c | 10 +++-------
> >   include/drm/drm_dp_helper.h     |  1 +
> >   2 files changed, 4 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/drm_dp_helper.c
> > b/drivers/gpu/drm/drm_dp_helper.c
> > index eedbb48815b7..8fdf57b4a06c 100644
> > --- a/drivers/gpu/drm/drm_dp_helper.c
> > +++ b/drivers/gpu/drm/drm_dp_helper.c
> > @@ -249,13 +249,7 @@ static int drm_dp_dpcd_access(struct
> > drm_dp_aux *aux, u8 request,
> >   
> >   	mutex_lock(&aux->hw_mutex);
> >   
> > -	/*
> > -	 * The specification doesn't give any recommendation on how
> > often to
> > -	 * retry native transactions. We used to retry 7 times like for
> > -	 * aux i2c transactions but real world devices this wasn't
> > -	 * sufficient, bump to 32 which makes Dell 4k monitors happier.
> > -	 */
> > -	for (retry = 0; retry < 32; retry++) {
> > +	for (retry = 0; retry < aux->num_retries; retry++) {
> >   		if (ret != 0 && ret != -ETIMEDOUT) {
> >   			usleep_range(AUX_RETRY_INTERVAL,
> >   				     AUX_RETRY_INTERVAL + 100);
> > @@ -1744,6 +1738,8 @@ void drm_dp_aux_init(struct drm_dp_aux *aux)
> >   	aux->ddc.retries = 3;
> >   
> >   	aux->ddc.lock_ops = &drm_dp_i2c_lock_ops;
> > +	/*Still making the Dell 4k monitors happier*/
> 
> The original comment was helpful; this one isn't.

Noted and I apologize for the comment. Was just copy/past original
comment. 
> 

> Besides that, what problem does this patchset address? Too much
> probing? 

The problem mainly with disconnect. When disconnecting, AUX read will
fail because of timing out. The 32 retries cause the disconnect flow
especially for Type-C/TBT Docks with MST hubs taking a long time.
Just trying to reduce this disconnect time. In addition as I noted,
i915 does retry in addition to retries by this function. 

Currently this function retry for 4 AUX situations:
AUX_NAK, AUX_DEFER, I/O error reported by driver and AUX timeout.

> If I connect a Dell monitor to an Intel card, how often does it have
> to 
> probe?

I guess it depends on how you connect (direct, behind MST hub, behind DOCK with MST). But I believe the most time consuming is the disconnect flow. 

Thanks
Khaled
> 
> Best regards
> Thomas
> 
> > +	aux->num_retries = 32;
> >   }
> >   EXPORT_SYMBOL(drm_dp_aux_init);
> >   
> > diff --git a/include/drm/drm_dp_helper.h
> > b/include/drm/drm_dp_helper.h
> > index edffd1dcca3e..16cbfc8f5e66 100644
> > --- a/include/drm/drm_dp_helper.h
> > +++ b/include/drm/drm_dp_helper.h
> > @@ -1876,6 +1876,7 @@ struct drm_dp_aux {
> >   	struct mutex hw_mutex;
> >   	struct work_struct crc_work;
> >   	u8 crc_count;
> > +	int num_retries;
> >   	ssize_t (*transfer)(struct drm_dp_aux *aux,
> >   			    struct drm_dp_aux_msg *msg);
> >   	/**
> >
Lyude Paul Feb. 10, 2021, 6:03 p.m. UTC | #3
On Wed, 2021-02-10 at 00:33 -0800, Khaled Almahallawy wrote:
> The number of AUX retries specified in the DP specs is 7. Currently, to make
> Dell 4k monitors happier, the number of retries are 32.
> i915 also retries 5 times (intel_dp_aux_xfer) which means in the case of AUX
> timeout we actually retries 32 * 5 = 160 times.

Is there any good reason for i915 to actually be doing retries itself? It seems
like an easier solution here might just to be to fix i915 so it doesn't retry,
and just rely on DRM to do the retries as appropriate.

That being said though, I'm open to this if there is a legitimate reason for
i915 to be handling retries on its own

> 
> So making the number of aux retires a variable to allow for fine tuning and
> optimization of aux timing.
> 
> Signed-off-by: Khaled Almahallawy <khaled.almahallawy@intel.com>
> ---
>  drivers/gpu/drm/drm_dp_helper.c | 10 +++-------
>  include/drm/drm_dp_helper.h     |  1 +
>  2 files changed, 4 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_dp_helper.c b/drivers/gpu/drm/drm_dp_helper.c
> index eedbb48815b7..8fdf57b4a06c 100644
> --- a/drivers/gpu/drm/drm_dp_helper.c
> +++ b/drivers/gpu/drm/drm_dp_helper.c
> @@ -249,13 +249,7 @@ static int drm_dp_dpcd_access(struct drm_dp_aux *aux, u8
> request,
>  
>         mutex_lock(&aux->hw_mutex);
>  
> -       /*
> -        * The specification doesn't give any recommendation on how often to
> -        * retry native transactions. We used to retry 7 times like for
> -        * aux i2c transactions but real world devices this wasn't
> -        * sufficient, bump to 32 which makes Dell 4k monitors happier.
> -        */
> -       for (retry = 0; retry < 32; retry++) {
> +       for (retry = 0; retry < aux->num_retries; retry++) {
>                 if (ret != 0 && ret != -ETIMEDOUT) {
>                         usleep_range(AUX_RETRY_INTERVAL,
>                                      AUX_RETRY_INTERVAL + 100);
> @@ -1744,6 +1738,8 @@ void drm_dp_aux_init(struct drm_dp_aux *aux)
>         aux->ddc.retries = 3;
>  
>         aux->ddc.lock_ops = &drm_dp_i2c_lock_ops;
> +       /*Still making the Dell 4k monitors happier*/
> +       aux->num_retries = 32;
>  }
>  EXPORT_SYMBOL(drm_dp_aux_init);
>  
> diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h
> index edffd1dcca3e..16cbfc8f5e66 100644
> --- a/include/drm/drm_dp_helper.h
> +++ b/include/drm/drm_dp_helper.h
> @@ -1876,6 +1876,7 @@ struct drm_dp_aux {
>         struct mutex hw_mutex;
>         struct work_struct crc_work;
>         u8 crc_count;
> +       int num_retries;
>         ssize_t (*transfer)(struct drm_dp_aux *aux,
>                             struct drm_dp_aux_msg *msg);
>         /**
Almahallawy, Khaled Feb. 11, 2021, 6:56 a.m. UTC | #4
On Wed, 2021-02-10 at 13:03 -0500, Lyude Paul wrote:
> On Wed, 2021-02-10 at 00:33 -0800, Khaled Almahallawy wrote:
> > The number of AUX retries specified in the DP specs is 7.
> > Currently, to make
> > Dell 4k monitors happier, the number of retries are 32.
> > i915 also retries 5 times (intel_dp_aux_xfer) which means in the
> > case of AUX
> > timeout we actually retries 32 * 5 = 160 times.
> 
> Is there any good reason for i915 to actually be doing retries
> itself? It seems
> like an easier solution here might just to be to fix i915 so it
> doesn't retry,
> and just rely on DRM to do the retries as appropriate.
> 
> That being said though, I'm open to this if there is a legitimate
> reason for
> i915 to be handling retries on its own

i915 or others may benefit from controlling AUX retries to optimize and
minimize timing spent on the aux transactions.
 
drm_dpcd_access handles multiple aux retries cases the same way (retry
32 times at worst case). The 4 cases are:
1- *AUX receive or IO error*. I believe it is up to specific
implementation/HW once it detects a receive error to retry based on
their internal understanding using the timeout appropriate to the HW
capabilities.
 
2- *AUX Timeout* As the driver follows the specs for the recommended
timeout timer as defined in section  (2.11.2 AUX Transaction
Response/Reply Timeouts), the driver has more intelligence to know how
many retries it needs. This is more useful in case of DP-ALT if some
issues happen in Type-C stack that cause AUX timeout (e.g. USB3 dock
detected as high speed (USB2) causing SBU/AUX lines to be disabled).
Also the disconnect of MST hub/docks is dependent how fast a userspace
disconnect all MST connectors.  Not all user spaces do it as fast like
Chrome (ubuntu is an example): 
https://chromium-review.googlesource.com/c/chromium/src/+/2512550/  

3- *AUX_NACK* DP spec mentions retry 3 times for NACK(2.16.1.3 GTC Lock
Acquisition). However, sometimes we really don’t need any retry for
NACK, if DPRX replied AUX_NACK for DPCD that it doesn’t support (e.g.
reading LTTPR Capability and ID Field at DPCD F0000h ~ F0007).

4- *AUX_DEFER* The specs stated we may retry 7 times on AUX_DEFER
(3.5.1.2.3 LANEx_CHANNEL_EQ_DONE Sequence) and may terminate LT. Also
with the increased usage of USB4, TBT/Type-C Docks/Displays, and active
cables, the use of LTTPR becomes common which adds more challenge
especially if we have multiple LTTPRs and all operate in non-LTTPR
mode. In this case all LTTPRs will reply AUX_DEFER in 300us if it did
not receive any aux response from other LTTPRs and DPRX. The SCR states
we need to retry 7 times and not more than 50ms (DP v2.0 SCR on 8b/10b
DPTX and LTTPR Requirements Update to Section 3.6)

In addition I believe this function is not correct in treating
AUX_DEFER and AUX_NACK as -EIO. Especially for AUX_DEFER, it is a valid
1 byte response that can provide a valuable debugging information
especially in the case of on-board LTTPR where there is no way to
capture this AUX response between the SoC and LTTPR unless you solder
two wires on AUX_P and AUX_N lines and use i2c/aux analyzer to decode.
At least we should provide the same debug information as we do in
drm_dp_i2c_do_msg by printing AUX_DEFER and AUX_NACK.

Thank you for your feedback and review.

--Khaled
> 
> > So making the number of aux retires a variable to allow for fine
> > tuning and
> > optimization of aux timing.
> > 
> > Signed-off-by: Khaled Almahallawy <khaled.almahallawy@intel.com>
> > ---
> >  drivers/gpu/drm/drm_dp_helper.c | 10 +++-------
> >  include/drm/drm_dp_helper.h     |  1 +
> >  2 files changed, 4 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/drm_dp_helper.c
> > b/drivers/gpu/drm/drm_dp_helper.c
> > index eedbb48815b7..8fdf57b4a06c 100644
> > --- a/drivers/gpu/drm/drm_dp_helper.c
> > +++ b/drivers/gpu/drm/drm_dp_helper.c
> > @@ -249,13 +249,7 @@ static int drm_dp_dpcd_access(struct
> > drm_dp_aux *aux, u8
> > request,
> >  
> >         mutex_lock(&aux->hw_mutex);
> >  
> > -       /*
> > -        * The specification doesn't give any recommendation on how
> > often to
> > -        * retry native transactions. We used to retry 7 times like
> > for
> > -        * aux i2c transactions but real world devices this wasn't
> > -        * sufficient, bump to 32 which makes Dell 4k monitors
> > happier.
> > -        */
> > -       for (retry = 0; retry < 32; retry++) {
> > +       for (retry = 0; retry < aux->num_retries; retry++) {
> >                 if (ret != 0 && ret != -ETIMEDOUT) {
> >                         usleep_range(AUX_RETRY_INTERVAL,
> >                                      AUX_RETRY_INTERVAL + 100);
> > @@ -1744,6 +1738,8 @@ void drm_dp_aux_init(struct drm_dp_aux *aux)
> >         aux->ddc.retries = 3;
> >  
> >         aux->ddc.lock_ops = &drm_dp_i2c_lock_ops;
> > +       /*Still making the Dell 4k monitors happier*/
> > +       aux->num_retries = 32;
> >  }
> >  EXPORT_SYMBOL(drm_dp_aux_init);
> >  
> > diff --git a/include/drm/drm_dp_helper.h
> > b/include/drm/drm_dp_helper.h
> > index edffd1dcca3e..16cbfc8f5e66 100644
> > --- a/include/drm/drm_dp_helper.h
> > +++ b/include/drm/drm_dp_helper.h
> > @@ -1876,6 +1876,7 @@ struct drm_dp_aux {
> >         struct mutex hw_mutex;
> >         struct work_struct crc_work;
> >         u8 crc_count;
> > +       int num_retries;
> >         ssize_t (*transfer)(struct drm_dp_aux *aux,
> >                             struct drm_dp_aux_msg *msg);
> >         /**
Lyude Paul Feb. 12, 2021, 12:51 a.m. UTC | #5
(JFYI I have seen this email, but haven't gotten a chance to actually read
through it yet. I should have the time to do so tomorrow)

On Thu, 2021-02-11 at 06:56 +0000, Almahallawy, Khaled wrote:
> On Wed, 2021-02-10 at 13:03 -0500, Lyude Paul wrote:
> > On Wed, 2021-02-10 at 00:33 -0800, Khaled Almahallawy wrote:
> > > The number of AUX retries specified in the DP specs is 7.
> > > Currently, to make
> > > Dell 4k monitors happier, the number of retries are 32.
> > > i915 also retries 5 times (intel_dp_aux_xfer) which means in the
> > > case of AUX
> > > timeout we actually retries 32 * 5 = 160 times.
> > 
> > Is there any good reason for i915 to actually be doing retries
> > itself? It seems
> > like an easier solution here might just to be to fix i915 so it
> > doesn't retry,
> > and just rely on DRM to do the retries as appropriate.
> > 
> > That being said though, I'm open to this if there is a legitimate
> > reason for
> > i915 to be handling retries on its own
> 
> i915 or others may benefit from controlling AUX retries to optimize and
> minimize timing spent on the aux transactions.
>  
> drm_dpcd_access handles multiple aux retries cases the same way (retry
> 32 times at worst case). The 4 cases are:
> 1- *AUX receive or IO error*. I believe it is up to specific
> implementation/HW once it detects a receive error to retry based on
> their internal understanding using the timeout appropriate to the HW
> capabilities.
>  
> 2- *AUX Timeout* As the driver follows the specs for the recommended
> timeout timer as defined in section  (2.11.2 AUX Transaction
> Response/Reply Timeouts), the driver has more intelligence to know how
> many retries it needs. This is more useful in case of DP-ALT if some
> issues happen in Type-C stack that cause AUX timeout (e.g. USB3 dock
> detected as high speed (USB2) causing SBU/AUX lines to be disabled).
> Also the disconnect of MST hub/docks is dependent how fast a userspace
> disconnect all MST connectors.  Not all user spaces do it as fast like
> Chrome (ubuntu is an example): 
> https://chromium-review.googlesource.com/c/chromium/src/+/2512550/  
> 
> 3- *AUX_NACK* DP spec mentions retry 3 times for NACK(2.16.1.3 GTC Lock
> Acquisition). However, sometimes we really don’t need any retry for
> NACK, if DPRX replied AUX_NACK for DPCD that it doesn’t support (e.g.
> reading LTTPR Capability and ID Field at DPCD F0000h ~ F0007).
> 
> 4- *AUX_DEFER* The specs stated we may retry 7 times on AUX_DEFER
> (3.5.1.2.3 LANEx_CHANNEL_EQ_DONE Sequence) and may terminate LT. Also
> with the increased usage of USB4, TBT/Type-C Docks/Displays, and active
> cables, the use of LTTPR becomes common which adds more challenge
> especially if we have multiple LTTPRs and all operate in non-LTTPR
> mode. In this case all LTTPRs will reply AUX_DEFER in 300us if it did
> not receive any aux response from other LTTPRs and DPRX. The SCR states
> we need to retry 7 times and not more than 50ms (DP v2.0 SCR on 8b/10b
> DPTX and LTTPR Requirements Update to Section 3.6)
> 
> In addition I believe this function is not correct in treating
> AUX_DEFER and AUX_NACK as -EIO. Especially for AUX_DEFER, it is a valid
> 1 byte response that can provide a valuable debugging information
> especially in the case of on-board LTTPR where there is no way to
> capture this AUX response between the SoC and LTTPR unless you solder
> two wires on AUX_P and AUX_N lines and use i2c/aux analyzer to decode.
> At least we should provide the same debug information as we do in
> drm_dp_i2c_do_msg by printing AUX_DEFER and AUX_NACK.
> 
> Thank you for your feedback and review.
> 
> --Khaled
> > 
> > > So making the number of aux retires a variable to allow for fine
> > > tuning and
> > > optimization of aux timing.
> > > 
> > > Signed-off-by: Khaled Almahallawy <khaled.almahallawy@intel.com>
> > > ---
> > >  drivers/gpu/drm/drm_dp_helper.c | 10 +++-------
> > >  include/drm/drm_dp_helper.h     |  1 +
> > >  2 files changed, 4 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/drm_dp_helper.c
> > > b/drivers/gpu/drm/drm_dp_helper.c
> > > index eedbb48815b7..8fdf57b4a06c 100644
> > > --- a/drivers/gpu/drm/drm_dp_helper.c
> > > +++ b/drivers/gpu/drm/drm_dp_helper.c
> > > @@ -249,13 +249,7 @@ static int drm_dp_dpcd_access(struct
> > > drm_dp_aux *aux, u8
> > > request,
> > >  
> > >         mutex_lock(&aux->hw_mutex);
> > >  
> > > -       /*
> > > -        * The specification doesn't give any recommendation on how
> > > often to
> > > -        * retry native transactions. We used to retry 7 times like
> > > for
> > > -        * aux i2c transactions but real world devices this wasn't
> > > -        * sufficient, bump to 32 which makes Dell 4k monitors
> > > happier.
> > > -        */
> > > -       for (retry = 0; retry < 32; retry++) {
> > > +       for (retry = 0; retry < aux->num_retries; retry++) {
> > >                 if (ret != 0 && ret != -ETIMEDOUT) {
> > >                         usleep_range(AUX_RETRY_INTERVAL,
> > >                                      AUX_RETRY_INTERVAL + 100);
> > > @@ -1744,6 +1738,8 @@ void drm_dp_aux_init(struct drm_dp_aux *aux)
> > >         aux->ddc.retries = 3;
> > >  
> > >         aux->ddc.lock_ops = &drm_dp_i2c_lock_ops;
> > > +       /*Still making the Dell 4k monitors happier*/
> > > +       aux->num_retries = 32;
> > >  }
> > >  EXPORT_SYMBOL(drm_dp_aux_init);
> > >  
> > > diff --git a/include/drm/drm_dp_helper.h
> > > b/include/drm/drm_dp_helper.h
> > > index edffd1dcca3e..16cbfc8f5e66 100644
> > > --- a/include/drm/drm_dp_helper.h
> > > +++ b/include/drm/drm_dp_helper.h
> > > @@ -1876,6 +1876,7 @@ struct drm_dp_aux {
> > >         struct mutex hw_mutex;
> > >         struct work_struct crc_work;
> > >         u8 crc_count;
> > > +       int num_retries;
> > >         ssize_t (*transfer)(struct drm_dp_aux *aux,
> > >                             struct drm_dp_aux_msg *msg);
> > >         /**
Lyude Paul Feb. 13, 2021, 2:01 a.m. UTC | #6
(adding danvet to here, as I'd imagine they might be interested in seeing some
of this)

Thank you for the descriptive write up. I think we can solve some of the
problems you described here, however the patches that you submitted definitely
won't work as-is. In patch 2, by reverting Intel back to using only 7 retries
you technically break whatever monitor originally prompted us to bump the retry
count up to 32 in the first place. And we have to try our hardest to avoid
breaking people's displays when they were already working.

Also, I'll definitely point out though that a few of the issues you've pointed
out actually exist as workarounds for bad DisplayPort devices (which is a big
reason we have these helpers), but I think we might be able to fix some of the
issues you've mentioned by coming up with better workarounds. More details below

On Thu, 2021-02-11 at 06:56 +0000, Almahallawy, Khaled wrote:
> On Wed, 2021-02-10 at 13:03 -0500, Lyude Paul wrote:
> > On Wed, 2021-02-10 at 00:33 -0800, Khaled Almahallawy wrote:
> > > The number of AUX retries specified in the DP specs is 7.
> > > Currently, to make
> > > Dell 4k monitors happier, the number of retries are 32.
> > > i915 also retries 5 times (intel_dp_aux_xfer) which means in the
> > > case of AUX
> > > timeout we actually retries 32 * 5 = 160 times.
> > 
> > Is there any good reason for i915 to actually be doing retries
> > itself? It seems
> > like an easier solution here might just to be to fix i915 so it
> > doesn't retry,
> > and just rely on DRM to do the retries as appropriate.
> > 
> > That being said though, I'm open to this if there is a legitimate
> > reason for
> > i915 to be handling retries on its own
> 
> i915 or others may benefit from controlling AUX retries to optimize and
> minimize timing spent on the aux transactions.
>  
> drm_dpcd_access handles multiple aux retries cases the same way (retry
> 32 times at worst case). The 4 cases are:
> 1- *AUX receive or IO error*. I believe it is up to specific
> implementation/HW once it detects a receive error to retry based on
> their internal understanding using the timeout appropriate to the HW
> capabilities.

This makes sense so far

>  
> 2- *AUX Timeout* As the driver follows the specs for the recommended
> timeout timer as defined in section  (2.11.2 AUX Transaction
> Response/Reply Timeouts), the driver has more intelligence to know how
> many retries it needs. This is more useful in case of DP-ALT if some
> issues happen in Type-C stack that cause AUX timeout (e.g. USB3 dock
> detected as high speed (USB2) causing SBU/AUX lines to be disabled).
> Also the disconnect of MST hub/docks is dependent how fast a userspace
> disconnect all MST connectors.  Not all user spaces do it as fast like
> Chrome (ubuntu is an example): 
> https://chromium-review.googlesource.com/c/chromium/src/+/2512550/  

So - I'm not exactly following how this portion of the specification is relevant
to the issue that you're bringing up here. If I understand section 2.11.2
correctly, a "response" here is defined as a transmission from the DPRX which
informs us on the current state of the transaction that we've started. This
would be any one of the aux response statuses you've mentioned in this email:
AUX_NACK, AUX_ACK, or AUX_DEFER. It doesn't actually have anything to do with
the number of retries we have to do, because (ignoring the fact we retry on
AUX_NACKS in DRM for a moment here) that reply could be an AUX_DEFER which would
indicate that we have to resend the transaction and try again. I think this is
the correct understanding of section 2.11.2, because I definitely don't think
real world DP devices are able to actually complete a full AUX transaction
within a timespan of 300-400µs reliably for the most part.

The second thing to mention here is that regardless of the first point, I don't
think your point about MST displays needing to be removed by userspace is
correct. It is true that userspace can actually hold onto references to removed
MST connectors after they've been removed, but the main purpose of this being
possible is in order to accommodate legacy clients that wouldn't be able to
handle a connector disappearing under them cleanly. Once the MST topology which
a connector corresponds to is removed, the MST connector is removed ASAP from
the kernel's cache of the respective DisplayPort's MST topology along with all
of the connectors below it. At the same time, the respective DRM connectors are
also unregistered from userspace.

DRM explicitly doesn't allow any kind of client to enable new displays on
connectors that are unregistered, and will reject any modesets involving them
with the exception of modesets which only disable displays on those unregistered
connectors (this last part is mainly just to be nice to legacy clients). So it
doesn't really matter how quickly userspace disables the display configuration
on those connectors, from the kernel's perspective they're already gone and a
new MST topology can be connected to the system. The expectation is that even if
userspace doesn't turn those connectors off, the only possible move is to move
the CRTCs on them to new connectors or disable them outright.

Anyway-if there -is- actually a problem caused by userspace not disabling
displays on those connectors, that's definitely not intentional and not how
things are supposed to work. But this part of the MST helpers in DRM is pretty
solid at this point, and most times when people point out this oddity with MST
it ends up being a bug on userspace's end. Feel free to prove me wrong though!

> 
> 3- *AUX_NACK* DP spec mentions retry 3 times for NACK(2.16.1.3 GTC Lock
> Acquisition). However, sometimes we really don’t need any retry for
> NACK, if DPRX replied AUX_NACK for DPCD that it doesn’t support (e.g.
> reading LTTPR Capability and ID Field at DPCD F0000h ~ F0007).

Ah, yes, -this-. Originally we did actually just abort any transaction on
AUX_NACK, which is honestly the correct thing that we should be doing. But it
looks like I actually changed this at one point after finding some DP devices
that would send AUX_NACK instead of AUX_DEFER when they weren't yet ready to
receive messages. Luckily it seems I wrote up a pretty long explanation around
this when I changed this behavior back in:

82922da39190 ("drm/dp_helper: Retry aux transactions on all errors")

This was almost five years ago though when I was quite new to working on DRM, so
reading through this commit I already think I have some much better ideas for
how we can handle issues with DisplayPort devices like this. For starters, this
isn't the first workaround regarding a DisplayPort device or it's connected
source device waking up. There's also:

f808f63372cc ("drm/dp_helper: Perform throw-away read before actual read in
drm_dp_dpcd_read()")

Which was originally added as a workaround in the Intel driver, and then got
moved into the DRM helpers by me. The important thing about these two
workarounds that sticks out to me is that they're both issues with DP sinks/hubs
that only happen when either the source is first connected to a sink or hub, or
when the sink/hub is waking up from a low power state like D3. So it seems like
in both of these instances, after we manage one "successful" transaction (where
we define "successful" as both an AUX_ACK, -and- the monitor giving us a
sensible reply instead of random garbage) then things start to become normal and
match up with the spec.

The commonality between these two workarounds makes me think that we could solve
the AUX_NACK problem here (-and- this junky throwaway read) if we just kept
track of whether or not we've managed a "successful" transaction at least once,
after which point we can just immediately abort on any AUX_NACK we receive like
we used to. Which would solve the issue you're mentioning here with our handling
of AUX_NACKS, without breaking the monitors that actually need those
workarounds. First, we would we add a field to drm_dp_aux called "ready". Then,
we would want AUX transactions to go like this:

1. Whenever either of the following events occur:
   1a. A new DP device being connected to the system
   1b. Bringing an already-connected DP device out of a low power state through
       DPCD register 00600h or some other mechanism
   We set the "ready" field to False
2. Then, when the driver attempts an AUX channel transaction. We start to
   attempt a single DPCD register read from 00000h and retry until that
   transaction has completed with AUX_ACK. Take note that we will retry this
   transaction a total of 32 times like usual, but we will keep retrying even in
   the face of an AUX_NACK.
3. If the aforementioned transaction never completes with AUX_ACK, we consider
   the driver's DPCD transaction to have failed and return the appropriate
   return code.
   However-if the transaction does complete with AUX_ACK, we set the "ready"
   field to true.
4. We then attempt the original AUX transaction that the driver requested.
5. For the transaction in 4, and any subsequent transactions, as long as "ready"
   is set to True we go with the same 32 retries on AUX_DEFERs, but abort the
   transaction immediately on an AUX_NACK.

> 4- *AUX_DEFER* The specs stated we may retry 7 times on AUX_DEFER
> (3.5.1.2.3 LANEx_CHANNEL_EQ_DONE Sequence) and may terminate LT. Also
> with the increased usage of USB4, TBT/Type-C Docks/Displays, and active
> cables, the use of LTTPR becomes common which adds more challenge
> especially if we have multiple LTTPRs and all operate in non-LTTPR
> mode. In this case all LTTPRs will reply AUX_DEFER in 300us if it did
> not receive any aux response from other LTTPRs and DPRX. The SCR states
> we need to retry 7 times and not more than 50ms (DP v2.0 SCR on 8b/10b
> DPTX and LTTPR Requirements Update to Section 3.6)

I'm not sure where you're getting 7 retries and not more then 50ms from. I
currently have a copy of the DP v2.0 standard, but I'm not sure what SCR is. Is
this some sort of update to Section 3.6 from the DP v2.0 standard? Because I see
some mentions of 50ms response times in my copy of the 2.0 standard regarding
LTTPR, but they don't at all look related to what you're talking about here. (If
it is some update I don't have access to, I'll poke the X.org VESA contacts and
see if they can get me access to this).

Regardless though, I'm still not sure I understand the issue here. If we're
retrying a transaction, it's because the transaction didn't succeed - which in
turn means that something went wrong on the DPRX's end. In the event of
something going wrong with the DPRX for long enough that we end up exceeding
that 50ms timeout, wouldn't that already mean that the link training process is
already failing and needs to be aborted? Or are you saying that we would receive
AUX_NACKs in such an event, which could cause us to keep retrying the
transaction for longer then 50ms, resulting in the DPRX ending the link training
prematurely? If it's the former, hopefully the solution I suggested for your
third point would end up fixing this anyway (but I'm always open to discussion
if that solution isn't enough).

> 
> In addition I believe this function is not correct in treating
> AUX_DEFER and AUX_NACK as -EIO. Especially for AUX_DEFER, it is a valid
> 1 byte response that can provide a valuable debugging information
> especially in the case of on-board LTTPR where there is no way to
> capture this AUX response between the SoC and LTTPR unless you solder
> two wires on AUX_P and AUX_N lines and use i2c/aux analyzer to decode.
> At least we should provide the same debug information as we do in
> drm_dp_i2c_do_msg by printing AUX_DEFER and AUX_NACK.
> 
> Thank you for your feedback and review.

Is the debugging output for DPCD transactions (e.g. setting bit 0x100 for
drm.debug on the kernel commandline) not sufficient enough for this kind of
debugging? I'm fine with us being more specific with our return codes in that
case, I just wonder if they would conflict with any of the error codes some of
the DRM DP helpers already return.

Anyway-let me know if there's anything in my responses I need to clarify, or
anything I missed here. Also, sorry for the very long response! There was a lot
of context I had to dump here for this to make sense.

> --Khaled
> > 
> > > So making the number of aux retires a variable to allow for fine
> > > tuning and
> > > optimization of aux timing.
> > > 
> > > Signed-off-by: Khaled Almahallawy <khaled.almahallawy@intel.com>
> > > ---
> > >  drivers/gpu/drm/drm_dp_helper.c | 10 +++-------
> > >  include/drm/drm_dp_helper.h     |  1 +
> > >  2 files changed, 4 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/drm_dp_helper.c
> > > b/drivers/gpu/drm/drm_dp_helper.c
> > > index eedbb48815b7..8fdf57b4a06c 100644
> > > --- a/drivers/gpu/drm/drm_dp_helper.c
> > > +++ b/drivers/gpu/drm/drm_dp_helper.c
> > > @@ -249,13 +249,7 @@ static int drm_dp_dpcd_access(struct
> > > drm_dp_aux *aux, u8
> > > request,
> > >  
> > >         mutex_lock(&aux->hw_mutex);
> > >  
> > > -       /*
> > > -        * The specification doesn't give any recommendation on how
> > > often to
> > > -        * retry native transactions. We used to retry 7 times like
> > > for
> > > -        * aux i2c transactions but real world devices this wasn't
> > > -        * sufficient, bump to 32 which makes Dell 4k monitors
> > > happier.
> > > -        */
> > > -       for (retry = 0; retry < 32; retry++) {
> > > +       for (retry = 0; retry < aux->num_retries; retry++) {
> > >                 if (ret != 0 && ret != -ETIMEDOUT) {
> > >                         usleep_range(AUX_RETRY_INTERVAL,
> > >                                      AUX_RETRY_INTERVAL + 100);
> > > @@ -1744,6 +1738,8 @@ void drm_dp_aux_init(struct drm_dp_aux *aux)
> > >         aux->ddc.retries = 3;
> > >  
> > >         aux->ddc.lock_ops = &drm_dp_i2c_lock_ops;
> > > +       /*Still making the Dell 4k monitors happier*/
> > > +       aux->num_retries = 32;
> > >  }
> > >  EXPORT_SYMBOL(drm_dp_aux_init);
> > >  
> > > diff --git a/include/drm/drm_dp_helper.h
> > > b/include/drm/drm_dp_helper.h
> > > index edffd1dcca3e..16cbfc8f5e66 100644
> > > --- a/include/drm/drm_dp_helper.h
> > > +++ b/include/drm/drm_dp_helper.h
> > > @@ -1876,6 +1876,7 @@ struct drm_dp_aux {
> > >         struct mutex hw_mutex;
> > >         struct work_struct crc_work;
> > >         u8 crc_count;
> > > +       int num_retries;
> > >         ssize_t (*transfer)(struct drm_dp_aux *aux,
> > >                             struct drm_dp_aux_msg *msg);
> > >         /**
Almahallawy, Khaled Feb. 13, 2021, 10:26 a.m. UTC | #7
On Fri, 2021-02-12 at 21:01 -0500, Lyude Paul wrote:
> (adding danvet to here, as I'd imagine they might be interested in
> seeing some
> of this)
> 
> Thank you for the descriptive write up. I think we can solve some of
> the
> problems you described here, however the patches that you submitted
> definitely
> won't work as-is. In patch 2, by reverting Intel back to using only 7
> retries
> you technically break whatever monitor originally prompted us to bump
> the retry
> count up to 32 in the first place. And we have to try our hardest to
> avoid
> breaking people's displays when they were already working.

Got it. Thank you for pointing that out. 
> 
> Also, I'll definitely point out though that a few of the issues
> you've pointed
> out actually exist as workarounds for bad DisplayPort devices (which
> is a big
> reason we have these helpers), but I think we might be able to fix
> some of the
> issues you've mentioned by coming up with better workarounds. More
> details below
> 
> On Thu, 2021-02-11 at 06:56 +0000, Almahallawy, Khaled wrote:
> > On Wed, 2021-02-10 at 13:03 -0500, Lyude Paul wrote:
> > > On Wed, 2021-02-10 at 00:33 -0800, Khaled Almahallawy wrote:
> > > > The number of AUX retries specified in the DP specs is 7.
> > > > Currently, to make
> > > > Dell 4k monitors happier, the number of retries are 32.
> > > > i915 also retries 5 times (intel_dp_aux_xfer) which means in
> > > > the
> > > > case of AUX
> > > > timeout we actually retries 32 * 5 = 160 times.
> > > 
> > > Is there any good reason for i915 to actually be doing retries
> > > itself? It seems
> > > like an easier solution here might just to be to fix i915 so it
> > > doesn't retry,
> > > and just rely on DRM to do the retries as appropriate.
> > > 
> > > That being said though, I'm open to this if there is a legitimate
> > > reason for
> > > i915 to be handling retries on its own
> > 
> > i915 or others may benefit from controlling AUX retries to optimize
> > and
> > minimize timing spent on the aux transactions.
> >  
> > drm_dpcd_access handles multiple aux retries cases the same way
> > (retry
> > 32 times at worst case). The 4 cases are:
> > 1- *AUX receive or IO error*. I believe it is up to specific
> > implementation/HW once it detects a receive error to retry based on
> > their internal understanding using the timeout appropriate to the
> > HW
> > capabilities.
> 
> This makes sense so far
> 
> >  
> > 2- *AUX Timeout* As the driver follows the specs for the
> > recommended
> > timeout timer as defined in section  (2.11.2 AUX Transaction
> > Response/Reply Timeouts), the driver has more intelligence to know
> > how
> > many retries it needs. This is more useful in case of DP-ALT if
> > some
> > issues happen in Type-C stack that cause AUX timeout (e.g. USB3
> > dock
> > detected as high speed (USB2) causing SBU/AUX lines to be
> > disabled).
> > Also the disconnect of MST hub/docks is dependent how fast a
> > userspace
> > disconnect all MST connectors.  Not all user spaces do it as fast
> > like
> > Chrome (ubuntu is an example): 
> > https://chromium-review.googlesource.com/c/chromium/src/+/2512550/ 
> >  
> 
> So - I'm not exactly following how this portion of the specification
> is relevant
> to the issue that you're bringing up here. If I understand section
> 2.11.2
> correctly, a "response" here is defined as a transmission from the
> DPRX which
> informs us on the current state of the transaction that we've
> started. This
> would be any one of the aux response statuses you've mentioned in
> this email:
> AUX_NACK, AUX_ACK, or AUX_DEFER. It doesn't actually have anything to
> do with
> the number of retries we have to do, because (ignoring the fact we
> retry on
> AUX_NACKS in DRM for a moment here) that reply could be an AUX_DEFER
> which would
> indicate that we have to resend the transaction and try again. I
> think this is
> the correct understanding of section 2.11.2, because I definitely
> don't think
> real world DP devices are able to actually complete a full AUX
> transaction
> within a timespan of 300-400µs reliably for the most part.

The AUX timeout I am considering is from the point of view of DPTX
(Display Source) not DPRX. Quoting DP2.0 Spec- Sec 2.11.2:
“If the DPTX does not receive a reply from the DPRX, the DPTX shall
wait for an AUX Reply Timeout *timer period* before initiating the next
AUX Request transaction.
…
For all AUX transactions, the AUX Reply Timeout *timer* starts ticking
after the DPTX finishes transmitting the AUX STOP condition.
The timer is *reset* when the AUX Reply Timeout timer period has
*elapsed*”
 
This AUX timeout retries is also defined in DP 1.4a Link Layer CTS
r1.0:
“4.2.1.1 Source DUT Retry on No-Reply during AUX Read after HPD Plug
Event
Test Objective:
This test confirms that the Source DUT retries AUX requests on HPD plug
event if the Sink does not initially reply.”
 
Based on this, when the Aux timeout *timer* configured by
hardware/driver is *expired* before the DP source receives any response
from the DP sink (whether this response is ACK or DEFER or NACK), the
DP Source may retry the same AUX transaction. 
 
These retries for timeout is handled by this part of the function:
		if (ret != 0 && ret != -ETIMEDOUT) {
			usleep_range(AUX_RETRY_INTERVAL,
				     AUX_RETRY_INTERVAL + 100);
		}
 
Note that ACK, DEFER, and NACK are handled in this part:
if (ret >= 0) {
	....
}
 
If you check aux->transfer(aux, &msg) implemented by display drivers.
The ETIMEDOUT is returned when Source doesn’t receive anything from
sink
 
I915 - intel_dp_aux.c/intel_dp_aux_xfer:
	
	if (status & DP_AUX_CH_CTL_TIME_OUT_ERROR) {
		drm_dbg_kms(&i915->drm, "%s: timeout (status
0x%08x)\n",
			    intel_dp->aux.name, status);
		ret = -ETIMEDOUT;
		goto out;
	}
 
AMD - atombios_dp.c/amdgpu_atombios_dp_process_aux_ch
	/* timeout */
	if (args.v2.ucReplyStatus == 1) {
		r = -ETIMEDOUT;
		goto done;
	}
 
Tegra: dpaux.c/tegra_dpaux_transfer
	status = wait_for_completion_timeout(&dpaux->complete,
timeout);
	if (!status)
		return -ETIMEDOUT;
 
Based on my experience, AUX timeout likely to happen in two obvious
scenarios:
(1)Disconnect flow. (2)DP-ALT mode. In DP-ALT,  displays rely on USB PD
to enable AUX Lines. Anything can happen in this logic leads to aux
timeout. That is why I believe as the driver knows which mode it
operates on (whether legacy/native DP or DP-ALT mode or even TBT
tunneled mode),  Driver may know how many times it should retry when
aux transactions timed out. 
> 
> The second thing to mention here is that regardless of the first
> point, I don't
> think your point about MST displays needing to be removed by
> userspace is
> correct. It is true that userspace can actually hold onto references
> to removed
> MST connectors after they've been removed, but the main purpose of
> this being
> possible is in order to accommodate legacy clients that wouldn't be
> able to
> handle a connector disappearing under them cleanly. Once the MST
> topology which
> a connector corresponds to is removed, the MST connector is removed
> ASAP from
> the kernel's cache of the respective DisplayPort's MST topology along
> with all
> of the connectors below it. At the same time, the respective DRM
> connectors are
> also unregistered from userspace.
> 
> DRM explicitly doesn't allow any kind of client to enable new
> displays on
> connectors that are unregistered, and will reject any modesets
> involving them
> with the exception of modesets which only disable displays on those
> unregistered
> connectors (this last part is mainly just to be nice to legacy
> clients). So it
> doesn't really matter how quickly userspace disables the display
> configuration
> on those connectors, from the kernel's perspective they're already
> gone and a
> new MST topology can be connected to the system. The expectation is
> that even if
> userspace doesn't turn those connectors off, the only possible move
> is to move
> the CRTCs on them to new connectors or disable them outright.
> 
> Anyway-if there -is- actually a problem caused by userspace not
> disabling
> displays on those connectors, that's definitely not intentional and
> not how
> things are supposed to work. But this part of the MST helpers in DRM
> is pretty
> solid at this point, and most times when people point out this oddity
> with MST
> it ends up being a bug on userspace's end. Feel free to prove me
> wrong though!

Apology for not being clear on the MST issue. My complaint is not about
DRM support for MST. It is about userspace support for MST during
disconnect. Based on my previous experience: 
https://patchwork.freedesktop.org/patch/395901/, as userspace failed to
try failing commit, or keep retrying a primary MST connector not the
secondary as i saw in ubuntu. My observation is based on using the same
Dell Dock, using the same kernel and driver on the same HW(TGL), but
with different userspace (chrome vs Ubuntu). After the chrome fix i
mentioned above, disconnecting Dell dock in Chrome is much faster than
ubuntu.
> 
> > 3- *AUX_NACK* DP spec mentions retry 3 times for NACK(2.16.1.3 GTC
> > Lock
> > Acquisition). However, sometimes we really don’t need any retry for
> > NACK, if DPRX replied AUX_NACK for DPCD that it doesn’t support
> > (e.g.
> > reading LTTPR Capability and ID Field at DPCD F0000h ~ F0007).
> 
> Ah, yes, -this-. Originally we did actually just abort any
> transaction on
> AUX_NACK, which is honestly the correct thing that we should be
> doing. But it
> looks like I actually changed this at one point after finding some DP
> devices
> that would send AUX_NACK instead of AUX_DEFER when they weren't yet
> ready to
> receive messages. Luckily it seems I wrote up a pretty long
> explanation around
> this when I changed this behavior back in:
> 
> 82922da39190 ("drm/dp_helper: Retry aux transactions on all errors")
> 
> This was almost five years ago though when I was quite new to working
> on DRM, so
> reading through this commit I already think I have some much better
> ideas for
> how we can handle issues with DisplayPort devices like this. For
> starters, this
> isn't the first workaround regarding a DisplayPort device or it's
> connected
> source device waking up. There's also:
> 
> f808f63372cc ("drm/dp_helper: Perform throw-away read before actual
> read in
> drm_dp_dpcd_read()")
> 
> Which was originally added as a workaround in the Intel driver, and
> then got
> moved into the DRM helpers by me. The important thing about these two
> workarounds that sticks out to me is that they're both issues with DP
> sinks/hubs
> that only happen when either the source is first connected to a sink
> or hub, or
> when the sink/hub is waking up from a low power state like D3. So it
> seems like
> in both of these instances, after we manage one "successful"
> transaction (where
> we define "successful" as both an AUX_ACK, -and- the monitor giving
> us a
> sensible reply instead of random garbage) then things start to become
> normal and
> match up with the spec.
> 
> The commonality between these two workarounds makes me think that we
> could solve
> the AUX_NACK problem here (-and- this junky throwaway read) if we
> just kept
> track of whether or not we've managed a "successful" transaction at
> least once,
> after which point we can just immediately abort on any AUX_NACK we
> receive like
> we used to. Which would solve the issue you're mentioning here with
> our handling
> of AUX_NACKS, without breaking the monitors that actually need those
> workarounds. First, we would we add a field to drm_dp_aux called
> "ready". Then,
> we would want AUX transactions to go like this:
> 
> 1. Whenever either of the following events occur:
>    1a. A new DP device being connected to the system
>    1b. Bringing an already-connected DP device out of a low power
> state through
>        DPCD register 00600h or some other mechanism
>    We set the "ready" field to False
> 2. Then, when the driver attempts an AUX channel transaction. We
> start to
>    attempt a single DPCD register read from 00000h and retry until
> that
>    transaction has completed with AUX_ACK. Take note that we will
> retry this
>    transaction a total of 32 times like usual, but we will keep
> retrying even in
>    the face of an AUX_NACK.
> 3. If the aforementioned transaction never completes with AUX_ACK, we
> consider
>    the driver's DPCD transaction to have failed and return the
> appropriate
>    return code.
>    However-if the transaction does complete with AUX_ACK, we set the
> "ready"
>    field to true.
> 4. We then attempt the original AUX transaction that the driver
> requested.
> 5. For the transaction in 4, and any subsequent transactions, as long
> as "ready"
>    is set to True we go with the same 32 retries on AUX_DEFERs, but
> abort the
>    transaction immediately on an AUX_NACK.

Thank you for providing the history for NACK. And Yes this solution
makes sense. If a single transaction is successful we can abort on all
subsequent AUX_NACK. I will be happy to give this a try. 


> 
> > 4- *AUX_DEFER* The specs stated we may retry 7 times on AUX_DEFER
> > (3.5.1.2.3 LANEx_CHANNEL_EQ_DONE Sequence) and may terminate LT.
> > Also
> > with the increased usage of USB4, TBT/Type-C Docks/Displays, and
> > active
> > cables, the use of LTTPR becomes common which adds more challenge
> > especially if we have multiple LTTPRs and all operate in non-LTTPR
> > mode. In this case all LTTPRs will reply AUX_DEFER in 300us if it
> > did
> > not receive any aux response from other LTTPRs and DPRX. The SCR
> > states
> > we need to retry 7 times and not more than 50ms (DP v2.0 SCR on
> > 8b/10b
> > DPTX and LTTPR Requirements Update to Section 3.6)
> 
> I'm not sure where you're getting 7 retries and not more then 50ms
> from. I
> currently have a copy of the DP v2.0 standard, but I'm not sure what
> SCR is. Is
> this some sort of update to Section 3.6 from the DP v2.0 standard?
> Because I see
> some mentions of 50ms response times in my copy of the 2.0 standard
> regarding
> LTTPR, but they don't at all look related to what you're talking
> about here. (If
> it is some update I don't have access to, I'll poke the X.org VESA
> contacts and
> see if they can get me access to this).

The link for this SCR (requires VESA memebership): 
https://groups.vesa.org/wg/Link/document/15764


> Regardless though, I'm still not sure I understand the issue here. If
> we're
> retrying a transaction, it's because the transaction didn't succeed -
> which in
> turn means that something went wrong on the DPRX's end. In the event
> of
> something going wrong with the DPRX for long enough that we end up
> exceeding
> that 50ms timeout, wouldn't that already mean that the link training
> process is
> already failing and needs to be aborted? Or are you saying that we
> would receive
> AUX_NACKs in such an event, which could cause us to keep retrying the
> transaction for longer then 50ms, resulting in the DPRX ending the
> link training
> prematurely? If it's the former, hopefully the solution I suggested
> for your
> third point would end up fixing this anyway (but I'm always open to
> discussion
> if that solution isn't enough).

I will try to explain based on my understanding. 
The problem happens when we have multiple cascaded LTTPRs between DPTX
and DPRX and all operate in Non-LTTPR mode. The SCR states the
following:
“A DPTX shall retry the same AUX request at least 7 times upon
receiving AUX_DEFER reply.
* How many times to retry at most is a DPTX implementation-specific
choice. However, it shall not retry indefinitely (e.g., longer than
50ms) to avoid soft lock-up condition”
 
To understand that, assume the following configuration:
 
DPTX-----LTTPR4------LTTTPR3------LTTPR2------LTTTPR2------DPRX
 
If DPTX didn’t configure LTTPRs 1-4 as transparent or non-transparent,
all these LTTPRs would operate in non-LTTPR mode. In this mode, the SCR
states:
“In case there is no AUX reply transaction to forward within the AUX
Response Timeout period of 300us, the UFP of an LTTPR in Non-LTTPR Mode
shall issue an AUX_DEFER reply transaction.”
 
That means if:
1- DPTX sends AUX request to LTTPR4 which forward to LTTPR3, then
LTTPR3 forward to LTTPR2 and so on until it is received by DPRX
2- While LTTPR4 waiting for reply, if 300us is elapsed. LTTPR4 will
reply AUX_DEFER
3- DPTX will receive AUX_DEFER from LTTPR4 and will resend the AUX
request.
4- As it is a long path, LTTPR4 may reply AUX_DEFER again. Resulting in
DPTX to resend the AUX request.
5- On the same time while LTTPR3 waiting for reply, 300us elapsed,
resulting LTTPR3 to issue AUX_DEFER to DPTX which means DPTX will
reissue the AUX request.
6. And so on until Aux request is received by DPRX and DPRX starts
sending the reply. 
 
During this journey of Aux request from DPTX to DPRX and AUX reply to
DPTX,  LTTPR1-4 will keep issuing AUX_DEFER every 300us if they didn’t
receive any reply on the DFP. And with each AUX_DEFER received, the
DPTX will resend the Aux request. 
 
The SCR is trying to put a limit on this AUX_DEFER as it can go
indefinitely. 


> 
> > In addition I believe this function is not correct in treating
> > AUX_DEFER and AUX_NACK as -EIO. Especially for AUX_DEFER, it is a
> > valid
> > 1 byte response that can provide a valuable debugging information
> > especially in the case of on-board LTTPR where there is no way to
> > capture this AUX response between the SoC and LTTPR unless you
> > solder
> > two wires on AUX_P and AUX_N lines and use i2c/aux analyzer to
> > decode.
> > At least we should provide the same debug information as we do in
> > drm_dp_i2c_do_msg by printing AUX_DEFER and AUX_NACK.
> > 
> > Thank you for your feedback and review.
> 
> Is the debugging output for DPCD transactions (e.g. setting bit 0x100
> for
> drm.debug on the kernel commandline) not sufficient enough for this
> kind of
> debugging? I'm fine with us being more specific with our return codes
> in that
> case, I just wonder if they would conflict with any of the error
> codes some of
> the DRM DP helpers already return.

I believe it is not sufficient. Please note for AUX_NACK and AUX_DEFER
even with 0x100 enabled, we don't know from the kernel logs how many
times we received NACK or DEFER because drm_dp_dump_access is called
after drm_dp_dpcd_access in drm_dp_dpcd_read and  drm_dp_dpcd_write. On
the contrary, in  drm_dp_i2c_do_msg, we can know from kernel logs how
many times we got NACK or DEFER. 
 
This is important if we use an AUX analyzer like Ellisys or DPA-400.
What the aux analyze captures will be more than what we see in the
kernel logs. Although the kernel logs should be at least equal to the
AUX captured by the aux analyzer or more.
 
I think we should have the option to indicate how many times we got
DEFER or NACK. Something like that wihtout even changing the return
value of this function:
 
 -269,8 +269,12 @@ static int drm_dp_dpcd_access(struct drm_dp_aux
*aux, u8 request,
                                        goto unlock;
 
                                ret = -EPROTO;
-                       } else
-                               ret = -EIO;
+                       } else if (native_reply ==
DP_AUX_NATIVE_REPLY_DEFER)
+                               DRM_DEBUG_DP("%s: AUX_DEFER\n", aux-
>name);
+                       else if (native_reply ==
DP_AUX_NATIVE_REPLY_NACK)
+                               DRM_DEBUG_DP("%s: AUX_NACK\n", aux-
>name);
+
+                       ret = -EIO;
                }


> Anyway-let me know if there's anything in my responses I need to
> clarify, or
> anything I missed here. Also, sorry for the very long response! There
> was a lot
> of context I had to dump here for this to make sense.

Thank you so much for your detailed explanation and really appreciate
you taking the time to reply to me. 

I apologize for the long email as well :)
 
--Khaled

> 
> > --Khaled
> > > > So making the number of aux retires a variable to allow for
> > > > fine
> > > > tuning and
> > > > optimization of aux timing.
> > > > 
> > > > Signed-off-by: Khaled Almahallawy <khaled.almahallawy@intel.com
> > > > >
> > > > ---
> > > >  drivers/gpu/drm/drm_dp_helper.c | 10 +++-------
> > > >  include/drm/drm_dp_helper.h     |  1 +
> > > >  2 files changed, 4 insertions(+), 7 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/drm_dp_helper.c
> > > > b/drivers/gpu/drm/drm_dp_helper.c
> > > > index eedbb48815b7..8fdf57b4a06c 100644
> > > > --- a/drivers/gpu/drm/drm_dp_helper.c
> > > > +++ b/drivers/gpu/drm/drm_dp_helper.c
> > > > @@ -249,13 +249,7 @@ static int drm_dp_dpcd_access(struct
> > > > drm_dp_aux *aux, u8
> > > > request,
> > > >  
> > > >         mutex_lock(&aux->hw_mutex);
> > > >  
> > > > -       /*
> > > > -        * The specification doesn't give any recommendation on
> > > > how
> > > > often to
> > > > -        * retry native transactions. We used to retry 7 times
> > > > like
> > > > for
> > > > -        * aux i2c transactions but real world devices this
> > > > wasn't
> > > > -        * sufficient, bump to 32 which makes Dell 4k monitors
> > > > happier.
> > > > -        */
> > > > -       for (retry = 0; retry < 32; retry++) {
> > > > +       for (retry = 0; retry < aux->num_retries; retry++) {
> > > >                 if (ret != 0 && ret != -ETIMEDOUT) {
> > > >                         usleep_range(AUX_RETRY_INTERVAL,
> > > >                                      AUX_RETRY_INTERVAL + 100);
> > > > @@ -1744,6 +1738,8 @@ void drm_dp_aux_init(struct drm_dp_aux
> > > > *aux)
> > > >         aux->ddc.retries = 3;
> > > >  
> > > >         aux->ddc.lock_ops = &drm_dp_i2c_lock_ops;
> > > > +       /*Still making the Dell 4k monitors happier*/
> > > > +       aux->num_retries = 32;
> > > >  }
> > > >  EXPORT_SYMBOL(drm_dp_aux_init);
> > > >  
> > > > diff --git a/include/drm/drm_dp_helper.h
> > > > b/include/drm/drm_dp_helper.h
> > > > index edffd1dcca3e..16cbfc8f5e66 100644
> > > > --- a/include/drm/drm_dp_helper.h
> > > > +++ b/include/drm/drm_dp_helper.h
> > > > @@ -1876,6 +1876,7 @@ struct drm_dp_aux {
> > > >         struct mutex hw_mutex;
> > > >         struct work_struct crc_work;
> > > >         u8 crc_count;
> > > > +       int num_retries;
> > > >         ssize_t (*transfer)(struct drm_dp_aux *aux,
> > > >                             struct drm_dp_aux_msg *msg);
> > > >         /**
Lyude Paul Feb. 16, 2021, 7:06 p.m. UTC | #8
On Sat, 2021-02-13 at 10:26 +0000, Almahallawy, Khaled wrote:
> On Fri, 2021-02-12 at 21:01 -0500, Lyude Paul wrote:
> > (adding danvet to here, as I'd imagine they might be interested in
> > seeing some
> > of this)
> > 
> > Thank you for the descriptive write up. I think we can solve some of
> > the
> > problems you described here, however the patches that you submitted
> > definitely
> > won't work as-is. In patch 2, by reverting Intel back to using only 7
> > retries
> > you technically break whatever monitor originally prompted us to bump
> > the retry
> > count up to 32 in the first place. And we have to try our hardest to
> > avoid
> > breaking people's displays when they were already working.
> 
> Got it. Thank you for pointing that out. 
> > 
> > Also, I'll definitely point out though that a few of the issues
> > you've pointed
> > out actually exist as workarounds for bad DisplayPort devices (which
> > is a big
> > reason we have these helpers), but I think we might be able to fix
> > some of the
> > issues you've mentioned by coming up with better workarounds. More
> > details below
> > 
> > On Thu, 2021-02-11 at 06:56 +0000, Almahallawy, Khaled wrote:
> > > On Wed, 2021-02-10 at 13:03 -0500, Lyude Paul wrote:
> > > > On Wed, 2021-02-10 at 00:33 -0800, Khaled Almahallawy wrote:
> > > > > The number of AUX retries specified in the DP specs is 7.
> > > > > Currently, to make
> > > > > Dell 4k monitors happier, the number of retries are 32.
> > > > > i915 also retries 5 times (intel_dp_aux_xfer) which means in
> > > > > the
> > > > > case of AUX
> > > > > timeout we actually retries 32 * 5 = 160 times.
> > > > 
> > > > Is there any good reason for i915 to actually be doing retries
> > > > itself? It seems
> > > > like an easier solution here might just to be to fix i915 so it
> > > > doesn't retry,
> > > > and just rely on DRM to do the retries as appropriate.
> > > > 
> > > > That being said though, I'm open to this if there is a legitimate
> > > > reason for
> > > > i915 to be handling retries on its own
> > > 
> > > i915 or others may benefit from controlling AUX retries to optimize
> > > and
> > > minimize timing spent on the aux transactions.
> > >  
> > > drm_dpcd_access handles multiple aux retries cases the same way
> > > (retry
> > > 32 times at worst case). The 4 cases are:
> > > 1- *AUX receive or IO error*. I believe it is up to specific
> > > implementation/HW once it detects a receive error to retry based on
> > > their internal understanding using the timeout appropriate to the
> > > HW
> > > capabilities.
> > 
> > This makes sense so far
> > 
> > >  
> > > 2- *AUX Timeout* As the driver follows the specs for the
> > > recommended
> > > timeout timer as defined in section  (2.11.2 AUX Transaction
> > > Response/Reply Timeouts), the driver has more intelligence to know
> > > how
> > > many retries it needs. This is more useful in case of DP-ALT if
> > > some
> > > issues happen in Type-C stack that cause AUX timeout (e.g. USB3
> > > dock
> > > detected as high speed (USB2) causing SBU/AUX lines to be
> > > disabled).
> > > Also the disconnect of MST hub/docks is dependent how fast a
> > > userspace
> > > disconnect all MST connectors.  Not all user spaces do it as fast
> > > like
> > > Chrome (ubuntu is an example): 
> > > https://chromium-review.googlesource.com/c/chromium/src/+/2512550/ 
> > >  
> > 
> > So - I'm not exactly following how this portion of the specification
> > is relevant
> > to the issue that you're bringing up here. If I understand section
> > 2.11.2
> > correctly, a "response" here is defined as a transmission from the
> > DPRX which
> > informs us on the current state of the transaction that we've
> > started. This
> > would be any one of the aux response statuses you've mentioned in
> > this email:
> > AUX_NACK, AUX_ACK, or AUX_DEFER. It doesn't actually have anything to
> > do with
> > the number of retries we have to do, because (ignoring the fact we
> > retry on
> > AUX_NACKS in DRM for a moment here) that reply could be an AUX_DEFER
> > which would
> > indicate that we have to resend the transaction and try again. I
> > think this is
> > the correct understanding of section 2.11.2, because I definitely
> > don't think
> > real world DP devices are able to actually complete a full AUX
> > transaction
> > within a timespan of 300-400µs reliably for the most part.
> 
> The AUX timeout I am considering is from the point of view of DPTX
> (Display Source) not DPRX. Quoting DP2.0 Spec- Sec 2.11.2:
> “If the DPTX does not receive a reply from the DPRX, the DPTX shall
> wait for an AUX Reply Timeout *timer period* before initiating the next
> AUX Request transaction.
> …
> For all AUX transactions, the AUX Reply Timeout *timer* starts ticking
> after the DPTX finishes transmitting the AUX STOP condition.
> The timer is *reset* when the AUX Reply Timeout timer period has
> *elapsed*”
>  
> This AUX timeout retries is also defined in DP 1.4a Link Layer CTS
> r1.0:
> “4.2.1.1 Source DUT Retry on No-Reply during AUX Read after HPD Plug
> Event
> Test Objective:
> This test confirms that the Source DUT retries AUX requests on HPD plug
> event if the Sink does not initially reply.”
>  
> Based on this, when the Aux timeout *timer* configured by
> hardware/driver is *expired* before the DP source receives any response
> from the DP sink (whether this response is ACK or DEFER or NACK), the
> DP Source may retry the same AUX transaction. 
>  
> These retries for timeout is handled by this part of the function:
>                 if (ret != 0 && ret != -ETIMEDOUT) {
>                         usleep_range(AUX_RETRY_INTERVAL,
>                                      AUX_RETRY_INTERVAL + 100);
>                 }
>  
> Note that ACK, DEFER, and NACK are handled in this part:
> if (ret >= 0) {
>         ....
> }
>  
> If you check aux->transfer(aux, &msg) implemented by display drivers.
> The ETIMEDOUT is returned when Source doesn’t receive anything from
> sink
>  
> I915 - intel_dp_aux.c/intel_dp_aux_xfer:
>         
>         if (status & DP_AUX_CH_CTL_TIME_OUT_ERROR) {
>                 drm_dbg_kms(&i915->drm, "%s: timeout (status
> 0x%08x)\n",
>                             intel_dp->aux.name, status);
>                 ret = -ETIMEDOUT;
>                 goto out;
>         }
>  
> AMD - atombios_dp.c/amdgpu_atombios_dp_process_aux_ch
>         /* timeout */
>         if (args.v2.ucReplyStatus == 1) {
>                 r = -ETIMEDOUT;
>                 goto done;
>         }
>  
> Tegra: dpaux.c/tegra_dpaux_transfer
>         status = wait_for_completion_timeout(&dpaux->complete,
> timeout);
>         if (!status)
>                 return -ETIMEDOUT;
>  
> Based on my experience, AUX timeout likely to happen in two obvious
> scenarios:
> (1)Disconnect flow. (2)DP-ALT mode. In DP-ALT,  displays rely on USB PD
> to enable AUX Lines. Anything can happen in this logic leads to aux
> timeout. That is why I believe as the driver knows which mode it
> operates on (whether legacy/native DP or DP-ALT mode or even TBT
> tunneled mode),  Driver may know how many times it should retry when
> aux transactions timed out. 

So if the driver knows to abort a request prematurely, what if we just
defined a return code that the driver could return from ->transfer() to
indicate to the DP helpers "don't retry this transaction anymore, just
cancel it now". Something like -ECANCELED?

> > 
> > The second thing to mention here is that regardless of the first
> > point, I don't
> > think your point about MST displays needing to be removed by
> > userspace is
> > correct. It is true that userspace can actually hold onto references
> > to removed
> > MST connectors after they've been removed, but the main purpose of
> > this being
> > possible is in order to accommodate legacy clients that wouldn't be
> > able to
> > handle a connector disappearing under them cleanly. Once the MST
> > topology which
> > a connector corresponds to is removed, the MST connector is removed
> > ASAP from
> > the kernel's cache of the respective DisplayPort's MST topology along
> > with all
> > of the connectors below it. At the same time, the respective DRM
> > connectors are
> > also unregistered from userspace.
> > 
> > DRM explicitly doesn't allow any kind of client to enable new
> > displays on
> > connectors that are unregistered, and will reject any modesets
> > involving them
> > with the exception of modesets which only disable displays on those
> > unregistered
> > connectors (this last part is mainly just to be nice to legacy
> > clients). So it
> > doesn't really matter how quickly userspace disables the display
> > configuration
> > on those connectors, from the kernel's perspective they're already
> > gone and a
> > new MST topology can be connected to the system. The expectation is
> > that even if
> > userspace doesn't turn those connectors off, the only possible move
> > is to move
> > the CRTCs on them to new connectors or disable them outright.
> > 
> > Anyway-if there -is- actually a problem caused by userspace not
> > disabling
> > displays on those connectors, that's definitely not intentional and
> > not how
> > things are supposed to work. But this part of the MST helpers in DRM
> > is pretty
> > solid at this point, and most times when people point out this oddity
> > with MST
> > it ends up being a bug on userspace's end. Feel free to prove me
> > wrong though!
> 
> Apology for not being clear on the MST issue. My complaint is not about
> DRM support for MST. It is about userspace support for MST during
> disconnect. Based on my previous experience: 
> https://patchwork.freedesktop.org/patch/395901/, as userspace failed to
> try failing commit, or keep retrying a primary MST connector not the
> secondary as i saw in ubuntu. My observation is based on using the same
> Dell Dock, using the same kernel and driver on the same HW(TGL), but
> with different userspace (chrome vs Ubuntu). After the chrome fix i
> mentioned above, disconnecting Dell dock in Chrome is much faster than
> ubuntu.

Yeah-this is something that has to be fixed on userspace's end, also
seeing as ubuntu still uses gnome-shell by default it probably doesn't
help that gnome-shell doesn't yet have support for atomic modesetting.

> > 
> > > 3- *AUX_NACK* DP spec mentions retry 3 times for NACK(2.16.1.3 GTC
> > > Lock
> > > Acquisition). However, sometimes we really don’t need any retry for
> > > NACK, if DPRX replied AUX_NACK for DPCD that it doesn’t support
> > > (e.g.
> > > reading LTTPR Capability and ID Field at DPCD F0000h ~ F0007).
> > 
> > Ah, yes, -this-. Originally we did actually just abort any
> > transaction on
> > AUX_NACK, which is honestly the correct thing that we should be
> > doing. But it
> > looks like I actually changed this at one point after finding some DP
> > devices
> > that would send AUX_NACK instead of AUX_DEFER when they weren't yet
> > ready to
> > receive messages. Luckily it seems I wrote up a pretty long
> > explanation around
> > this when I changed this behavior back in:
> > 
> > 82922da39190 ("drm/dp_helper: Retry aux transactions on all errors")
> > 
> > This was almost five years ago though when I was quite new to working
> > on DRM, so
> > reading through this commit I already think I have some much better
> > ideas for
> > how we can handle issues with DisplayPort devices like this. For
> > starters, this
> > isn't the first workaround regarding a DisplayPort device or it's
> > connected
> > source device waking up. There's also:
> > 
> > f808f63372cc ("drm/dp_helper: Perform throw-away read before actual
> > read in
> > drm_dp_dpcd_read()")
> > 
> > Which was originally added as a workaround in the Intel driver, and
> > then got
> > moved into the DRM helpers by me. The important thing about these two
> > workarounds that sticks out to me is that they're both issues with DP
> > sinks/hubs
> > that only happen when either the source is first connected to a sink
> > or hub, or
> > when the sink/hub is waking up from a low power state like D3. So it
> > seems like
> > in both of these instances, after we manage one "successful"
> > transaction (where
> > we define "successful" as both an AUX_ACK, -and- the monitor giving
> > us a
> > sensible reply instead of random garbage) then things start to become
> > normal and
> > match up with the spec.
> > 
> > The commonality between these two workarounds makes me think that we
> > could solve
> > the AUX_NACK problem here (-and- this junky throwaway read) if we
> > just kept
> > track of whether or not we've managed a "successful" transaction at
> > least once,
> > after which point we can just immediately abort on any AUX_NACK we
> > receive like
> > we used to. Which would solve the issue you're mentioning here with
> > our handling
> > of AUX_NACKS, without breaking the monitors that actually need those
> > workarounds. First, we would we add a field to drm_dp_aux called
> > "ready". Then,
> > we would want AUX transactions to go like this:
> > 
> > 1. Whenever either of the following events occur:
> >    1a. A new DP device being connected to the system
> >    1b. Bringing an already-connected DP device out of a low power
> > state through
> >        DPCD register 00600h or some other mechanism
> >    We set the "ready" field to False
> > 2. Then, when the driver attempts an AUX channel transaction. We
> > start to
> >    attempt a single DPCD register read from 00000h and retry until
> > that
> >    transaction has completed with AUX_ACK. Take note that we will
> > retry this
> >    transaction a total of 32 times like usual, but we will keep
> > retrying even in
> >    the face of an AUX_NACK.
> > 3. If the aforementioned transaction never completes with AUX_ACK, we
> > consider
> >    the driver's DPCD transaction to have failed and return the
> > appropriate
> >    return code.
> >    However-if the transaction does complete with AUX_ACK, we set the
> > "ready"
> >    field to true.
> > 4. We then attempt the original AUX transaction that the driver
> > requested.
> > 5. For the transaction in 4, and any subsequent transactions, as long
> > as "ready"
> >    is set to True we go with the same 32 retries on AUX_DEFERs, but
> > abort the
> >    transaction immediately on an AUX_NACK.
> 
> Thank you for providing the history for NACK. And Yes this solution
> makes sense. If a single transaction is successful we can abort on all
> subsequent AUX_NACK. I will be happy to give this a try. 
> 
> 
> > 
> > > 4- *AUX_DEFER* The specs stated we may retry 7 times on AUX_DEFER
> > > (3.5.1.2.3 LANEx_CHANNEL_EQ_DONE Sequence) and may terminate LT.
> > > Also
> > > with the increased usage of USB4, TBT/Type-C Docks/Displays, and
> > > active
> > > cables, the use of LTTPR becomes common which adds more challenge
> > > especially if we have multiple LTTPRs and all operate in non-LTTPR
> > > mode. In this case all LTTPRs will reply AUX_DEFER in 300us if it
> > > did
> > > not receive any aux response from other LTTPRs and DPRX. The SCR
> > > states
> > > we need to retry 7 times and not more than 50ms (DP v2.0 SCR on
> > > 8b/10b
> > > DPTX and LTTPR Requirements Update to Section 3.6)
> > 
> > I'm not sure where you're getting 7 retries and not more then 50ms
> > from. I
> > currently have a copy of the DP v2.0 standard, but I'm not sure what
> > SCR is. Is
> > this some sort of update to Section 3.6 from the DP v2.0 standard?
> > Because I see
> > some mentions of 50ms response times in my copy of the 2.0 standard
> > regarding
> > LTTPR, but they don't at all look related to what you're talking
> > about here. (If
> > it is some update I don't have access to, I'll poke the X.org VESA
> > contacts and
> > see if they can get me access to this).
> 
> The link for this SCR (requires VESA memebership): 
> https://groups.vesa.org/wg/Link/document/15764

Alright-I'll contact the VESA folks and see if they can hook me up with this

> 
> 
> > Regardless though, I'm still not sure I understand the issue here. If
> > we're
> > retrying a transaction, it's because the transaction didn't succeed -
> > which in
> > turn means that something went wrong on the DPRX's end. In the event
> > of
> > something going wrong with the DPRX for long enough that we end up
> > exceeding
> > that 50ms timeout, wouldn't that already mean that the link training
> > process is
> > already failing and needs to be aborted? Or are you saying that we
> > would receive
> > AUX_NACKs in such an event, which could cause us to keep retrying the
> > transaction for longer then 50ms, resulting in the DPRX ending the
> > link training
> > prematurely? If it's the former, hopefully the solution I suggested
> > for your
> > third point would end up fixing this anyway (but I'm always open to
> > discussion
> > if that solution isn't enough).
> 
> I will try to explain based on my understanding. 
> The problem happens when we have multiple cascaded LTTPRs between DPTX
> and DPRX and all operate in Non-LTTPR mode. The SCR states the
> following:
> “A DPTX shall retry the same AUX request at least 7 times upon
> receiving AUX_DEFER reply.
> * How many times to retry at most is a DPTX implementation-specific
> choice. However, it shall not retry indefinitely (e.g., longer than
> 50ms) to avoid soft lock-up condition”
>  
> To understand that, assume the following configuration:
>  
> DPTX-----LTTPR4------LTTTPR3------LTTPR2------LTTTPR2------DPRX
>  
> If DPTX didn’t configure LTTPRs 1-4 as transparent or non-transparent,
> all these LTTPRs would operate in non-LTTPR mode. In this mode, the SCR
> states:
> “In case there is no AUX reply transaction to forward within the AUX
> Response Timeout period of 300us, the UFP of an LTTPR in Non-LTTPR Mode
> shall issue an AUX_DEFER reply transaction.”
>  
> That means if:
> 1- DPTX sends AUX request to LTTPR4 which forward to LTTPR3, then
> LTTPR3 forward to LTTPR2 and so on until it is received by DPRX
> 2- While LTTPR4 waiting for reply, if 300us is elapsed. LTTPR4 will
> reply AUX_DEFER
> 3- DPTX will receive AUX_DEFER from LTTPR4 and will resend the AUX
> request.
> 4- As it is a long path, LTTPR4 may reply AUX_DEFER again. Resulting in
> DPTX to resend the AUX request.
> 5- On the same time while LTTPR3 waiting for reply, 300us elapsed,
> resulting LTTPR3 to issue AUX_DEFER to DPTX which means DPTX will
> reissue the AUX request.
> 6. And so on until Aux request is received by DPRX and DPRX starts
> sending the reply. 
>  
> During this journey of Aux request from DPTX to DPRX and AUX reply to
> DPTX,  LTTPR1-4 will keep issuing AUX_DEFER every 300us if they didn’t
> receive any reply on the DFP. And with each AUX_DEFER received, the
> DPTX will resend the Aux request. 
>  
> The SCR is trying to put a limit on this AUX_DEFER as it can go
> indefinitely. 

Hm, so to make sure I'm understanding correctly: the concern here is
just that because of the likliness of an bunch of AUX_DEFERS from an
LTTPR, that we could end up taking longer then 50ms and cause something
to lockup?

Btw - I looked a bit more closely at the code in intel_dp.c. From the
git blame history, the code for doing the retries in intel_dp_aux_xfer()
seems to be quite old. It looks like you could actually just drop the
loop for retries below it:

		/* Must try at least 3 times according to DP spec */
		for (try = 0; try < 5; try++) {

And as long as you make sure that you retain the usleep_range() on
DP_AUX_CH_CTL_TIME_OUT_ERROR (in order to retain the potential delay
between each attempt with different aux_clock_dividers) I don't think
i915 would have any issues, and this would hopefully help us keep within
that 50ms recommended time. As well, we couuld also add something to the
DP helpers so that i915 can indicate that it handles the usleep_range()
delays itself to remove the delay there. We could even add a callbacks
to the DP helpers for once we start retrying/when we end retrying, if
you might need to do something like make sure we hold our CPU latency
qos request for the entirity of the retries that the helpers do.

> 
> 
> > 
> > > In addition I believe this function is not correct in treating
> > > AUX_DEFER and AUX_NACK as -EIO. Especially for AUX_DEFER, it is a
> > > valid
> > > 1 byte response that can provide a valuable debugging information
> > > especially in the case of on-board LTTPR where there is no way to
> > > capture this AUX response between the SoC and LTTPR unless you
> > > solder
> > > two wires on AUX_P and AUX_N lines and use i2c/aux analyzer to
> > > decode.
> > > At least we should provide the same debug information as we do in
> > > drm_dp_i2c_do_msg by printing AUX_DEFER and AUX_NACK.
> > > 
> > > Thank you for your feedback and review.
> > 
> > Is the debugging output for DPCD transactions (e.g. setting bit 0x100
> > for
> > drm.debug on the kernel commandline) not sufficient enough for this
> > kind of
> > debugging? I'm fine with us being more specific with our return codes
> > in that
> > case, I just wonder if they would conflict with any of the error
> > codes some of
> > the DRM DP helpers already return.
> 
> I believe it is not sufficient. Please note for AUX_NACK and AUX_DEFER
> even with 0x100 enabled, we don't know from the kernel logs how many
> times we received NACK or DEFER because drm_dp_dump_access is called
> after drm_dp_dpcd_access in drm_dp_dpcd_read and  drm_dp_dpcd_write. On
> the contrary, in  drm_dp_i2c_do_msg, we can know from kernel logs how
> many times we got NACK or DEFER. 
>  
> This is important if we use an AUX analyzer like Ellisys or DPA-400.
> What the aux analyze captures will be more than what we see in the
> kernel logs. Although the kernel logs should be at least equal to the
> AUX captured by the aux analyzer or more.
>  
> I think we should have the option to indicate how many times we got
> DEFER or NACK. Something like that wihtout even changing the return
> value of this function:

I'd be fine with this, have you looked into just teaching the DRM DP
helpers/callbacks to pass the number of retries that occurred to
drm_dp_dump_access so we could just append it to the current debuggng
output that we have?

>  
>  -269,8 +269,12 @@ static int drm_dp_dpcd_access(struct drm_dp_aux
> *aux, u8 request,
>                                         goto unlock;
>  
>                                 ret = -EPROTO;
> -                       } else
> -                               ret = -EIO;
> +                       } else if (native_reply ==
> DP_AUX_NATIVE_REPLY_DEFER)
> +                               DRM_DEBUG_DP("%s: AUX_DEFER\n", aux-
> > name);
> +                       else if (native_reply ==
> DP_AUX_NATIVE_REPLY_NACK)
> +                               DRM_DEBUG_DP("%s: AUX_NACK\n", aux-
> > name);
> +
> +                       ret = -EIO;
>                 }
> 
> 
> > Anyway-let me know if there's anything in my responses I need to
> > clarify, or
> > anything I missed here. Also, sorry for the very long response! There
> > was a lot
> > of context I had to dump here for this to make sense.
> 
> Thank you so much for your detailed explanation and really appreciate
> you taking the time to reply to me. 
> 
> I apologize for the long email as well :)

Oh it's no problem! When it comes to specifications like these, it's impossible
to get around discussing exact semantics of specifications like this especially
with how easy it is to get them wrong by mistake.

>  
> --Khaled
> 
> > 
> > > --Khaled
> > > > > So making the number of aux retires a variable to allow for
> > > > > fine
> > > > > tuning and
> > > > > optimization of aux timing.
> > > > > 
> > > > > Signed-off-by: Khaled Almahallawy <khaled.almahallawy@intel.com
> > > > > > 
> > > > > ---
> > > > >  drivers/gpu/drm/drm_dp_helper.c | 10 +++-------
> > > > >  include/drm/drm_dp_helper.h     |  1 +
> > > > >  2 files changed, 4 insertions(+), 7 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/drm_dp_helper.c
> > > > > b/drivers/gpu/drm/drm_dp_helper.c
> > > > > index eedbb48815b7..8fdf57b4a06c 100644
> > > > > --- a/drivers/gpu/drm/drm_dp_helper.c
> > > > > +++ b/drivers/gpu/drm/drm_dp_helper.c
> > > > > @@ -249,13 +249,7 @@ static int drm_dp_dpcd_access(struct
> > > > > drm_dp_aux *aux, u8
> > > > > request,
> > > > >  
> > > > >         mutex_lock(&aux->hw_mutex);
> > > > >  
> > > > > -       /*
> > > > > -        * The specification doesn't give any recommendation on
> > > > > how
> > > > > often to
> > > > > -        * retry native transactions. We used to retry 7 times
> > > > > like
> > > > > for
> > > > > -        * aux i2c transactions but real world devices this
> > > > > wasn't
> > > > > -        * sufficient, bump to 32 which makes Dell 4k monitors
> > > > > happier.
> > > > > -        */
> > > > > -       for (retry = 0; retry < 32; retry++) {
> > > > > +       for (retry = 0; retry < aux->num_retries; retry++) {
> > > > >                 if (ret != 0 && ret != -ETIMEDOUT) {
> > > > >                         usleep_range(AUX_RETRY_INTERVAL,
> > > > >                                      AUX_RETRY_INTERVAL + 100);
> > > > > @@ -1744,6 +1738,8 @@ void drm_dp_aux_init(struct drm_dp_aux
> > > > > *aux)
> > > > >         aux->ddc.retries = 3;
> > > > >  
> > > > >         aux->ddc.lock_ops = &drm_dp_i2c_lock_ops;
> > > > > +       /*Still making the Dell 4k monitors happier*/
> > > > > +       aux->num_retries = 32;
> > > > >  }
> > > > >  EXPORT_SYMBOL(drm_dp_aux_init);
> > > > >  
> > > > > diff --git a/include/drm/drm_dp_helper.h
> > > > > b/include/drm/drm_dp_helper.h
> > > > > index edffd1dcca3e..16cbfc8f5e66 100644
> > > > > --- a/include/drm/drm_dp_helper.h
> > > > > +++ b/include/drm/drm_dp_helper.h
> > > > > @@ -1876,6 +1876,7 @@ struct drm_dp_aux {
> > > > >         struct mutex hw_mutex;
> > > > >         struct work_struct crc_work;
> > > > >         u8 crc_count;
> > > > > +       int num_retries;
> > > > >         ssize_t (*transfer)(struct drm_dp_aux *aux,
> > > > >                             struct drm_dp_aux_msg *msg);
> > > > >         /**
diff mbox series

Patch

diff --git a/drivers/gpu/drm/drm_dp_helper.c b/drivers/gpu/drm/drm_dp_helper.c
index eedbb48815b7..8fdf57b4a06c 100644
--- a/drivers/gpu/drm/drm_dp_helper.c
+++ b/drivers/gpu/drm/drm_dp_helper.c
@@ -249,13 +249,7 @@  static int drm_dp_dpcd_access(struct drm_dp_aux *aux, u8 request,
 
 	mutex_lock(&aux->hw_mutex);
 
-	/*
-	 * The specification doesn't give any recommendation on how often to
-	 * retry native transactions. We used to retry 7 times like for
-	 * aux i2c transactions but real world devices this wasn't
-	 * sufficient, bump to 32 which makes Dell 4k monitors happier.
-	 */
-	for (retry = 0; retry < 32; retry++) {
+	for (retry = 0; retry < aux->num_retries; retry++) {
 		if (ret != 0 && ret != -ETIMEDOUT) {
 			usleep_range(AUX_RETRY_INTERVAL,
 				     AUX_RETRY_INTERVAL + 100);
@@ -1744,6 +1738,8 @@  void drm_dp_aux_init(struct drm_dp_aux *aux)
 	aux->ddc.retries = 3;
 
 	aux->ddc.lock_ops = &drm_dp_i2c_lock_ops;
+	/*Still making the Dell 4k monitors happier*/
+	aux->num_retries = 32;
 }
 EXPORT_SYMBOL(drm_dp_aux_init);
 
diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h
index edffd1dcca3e..16cbfc8f5e66 100644
--- a/include/drm/drm_dp_helper.h
+++ b/include/drm/drm_dp_helper.h
@@ -1876,6 +1876,7 @@  struct drm_dp_aux {
 	struct mutex hw_mutex;
 	struct work_struct crc_work;
 	u8 crc_count;
+	int num_retries;
 	ssize_t (*transfer)(struct drm_dp_aux *aux,
 			    struct drm_dp_aux_msg *msg);
 	/**