
[net] net: stmmac: Stop using a single dma_map() for multiple descriptors

Message ID 20240902095436.3756093-1-quic_jsuraj@quicinc.com (mailing list archive)
State Not Applicable
Series [net] net: stmmac: Stop using a single dma_map() for multiple descriptors

Commit Message

Suraj Jaiswal Sept. 2, 2024, 9:54 a.m. UTC
Currently the same page address is shared
between multiple buffer addresses, causing an SMMU fault for another
descriptor if the address held by one descriptor gets cleaned.
Allocate a separate buffer address for each descriptor
on the TSO path so that cleaning one descriptor does not
clean another descriptor's address.

Signed-off-by: Suraj Jaiswal <quic_jsuraj@quicinc.com>
---

Changes since v2:
- Fixed function description 
- Fixed handling of return value.


 .../net/ethernet/stmicro/stmmac/stmmac_main.c | 63 ++++++++++++-------
 1 file changed, 42 insertions(+), 21 deletions(-)

Comments

Andrew Halaney Sept. 3, 2024, 10:17 p.m. UTC | #1
On Mon, Sep 02, 2024 at 03:24:36PM GMT, Suraj Jaiswal wrote:
> Currently same page address is shared
> between multiple buffer addresses and causing smmu fault for other
> descriptor if address hold by one descriptor got cleaned.
> Allocate separate buffer address for each descriptor
> for TSO path so that if one descriptor cleared it should not
> clean other descriptor address.

I think maybe you mean something like:

    Currently in the TSO case a page is mapped with dma_map_single(), and then
    the resulting dma address is referenced (and offset) by multiple
    descriptors until the whole region is programmed into the descriptors.

    This makes it possible for stmmac_tx_clean() to dma_unmap() the first of the
    already processed descriptors, while the rest are still being processed
    by the DMA engine. This leads to an iommu fault due to the DMA engine using
    unmapped memory as seen below:

    <insert splat>

    You can reproduce this easily by <reproduction steps>.

    To fix this, let's map each descriptor's memory reference individually.
    This way there's no risk of unmapping a region that's still being
    referenced by the DMA engine in a later descriptor.

That's a bit nitpicky wording wise, but your first sentence is hard
for me to follow (buffer addresses seems to mean descriptor?). I think
showing a splat and mentioning how to reproduce is always a bonus as
well.
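
Just to illustrate the difference, a rough sketch (n_desc and desc[i].addr
are stand-ins here, not the driver's actual fields):

	/* current: one mapping, referenced (and offset) by N descriptors */
	des = dma_map_single(priv->device, skb->data, len, DMA_TO_DEVICE);
	for (i = 0; i < n_desc; i++)
		desc[i].addr = des + i * TSO_MAX_BUFF_SIZE;

	/* proposed: one mapping per descriptor, each unmapped independently */
	for (i = 0; i < n_desc; i++)
		desc[i].addr = dma_map_single(priv->device,
					      skb->data + i * TSO_MAX_BUFF_SIZE,
					      min_t(int, len - i * TSO_MAX_BUFF_SIZE,
						    TSO_MAX_BUFF_SIZE),
					      DMA_TO_DEVICE);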

> 
> Signed-off-by: Suraj Jaiswal <quic_jsuraj@quicinc.com>

Fixes: ?

At a quick glance I think its f748be531d70 ("stmmac: support new GMAC4")

> ---
> 
> Changes since v2:
> - Fixed function description 
> - Fixed handling of return value.
> 

This is v1 as far as netdev is concerned :)

> 
>  .../net/ethernet/stmicro/stmmac/stmmac_main.c | 63 ++++++++++++-------
>  1 file changed, 42 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index 83b654b7a9fd..5948774c403f 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -4136,16 +4136,18 @@ static bool stmmac_vlan_insert(struct stmmac_priv *priv, struct sk_buff *skb,
>  /**
>   *  stmmac_tso_allocator - close entry point of the driver
>   *  @priv: driver private structure
> - *  @des: buffer start address
> + *  @addr: Contains either skb frag address or skb->data address
>   *  @total_len: total length to fill in descriptors
>   *  @last_segment: condition for the last descriptor
>   *  @queue: TX queue index
> + * @is_skb_frag: condition to check whether skb data is part of fragment or not
>   *  Description:
>   *  This function fills descriptor and request new descriptors according to
>   *  buffer length to fill
> + *  This function returns 0 on success else -ERRNO on fail
>   */
> -static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
> -				 int total_len, bool last_segment, u32 queue)
> +static int stmmac_tso_allocator(struct stmmac_priv *priv, void *addr,
> +				int total_len, bool last_segment, u32 queue, bool is_skb_frag)
>  {
>  	struct stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[queue];
>  	struct dma_desc *desc;
> @@ -4153,6 +4155,8 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
>  	int tmp_len;
>  
>  	tmp_len = total_len;
> +	unsigned int offset = 0;
> +	unsigned char *data = addr;

Reverse xmas tree order, offset is always set below so you could just
declare it, and data really doesn't seem necessary to me vs using addr
directly.

https://docs.kernel.org/process/maintainer-netdev.html#local-variable-ordering-reverse-xmas-tree-rcs
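
i.e., longest declaration first, and since offset is always assigned before
it's used you can drop the initializer. Something like (untested):

	struct stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[queue];
	struct dma_desc *desc;
	unsigned int offset;
	u32 buff_size;
	int tmp_len;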

>  
>  	while (tmp_len > 0) {
>  		dma_addr_t curr_addr;
> @@ -4161,20 +4165,44 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
>  						priv->dma_conf.dma_tx_size);
>  		WARN_ON(tx_q->tx_skbuff[tx_q->cur_tx]);
>  
> +		buff_size = tmp_len >= TSO_MAX_BUFF_SIZE ? TSO_MAX_BUFF_SIZE : tmp_len;
> +
>  		if (tx_q->tbs & STMMAC_TBS_AVAIL)
>  			desc = &tx_q->dma_entx[tx_q->cur_tx].basic;
>  		else
>  			desc = &tx_q->dma_tx[tx_q->cur_tx];
>  
> -		curr_addr = des + (total_len - tmp_len);
> +		offset = total_len - tmp_len;
> +		if (!is_skb_frag) {
> +			curr_addr = dma_map_single(priv->device, data + offset, buff_size,
> +						   DMA_TO_DEVICE);

Instead of defining "data" above, can't you just use "addr" directly here?

> +
> +			if (dma_mapping_error(priv->device, curr_addr))
> +				return -ENOMEM;
> +
> +			tx_q->tx_skbuff_dma[tx_q->cur_tx].buf = curr_addr;
> +			tx_q->tx_skbuff_dma[tx_q->cur_tx].len = buff_size;
> +			tx_q->tx_skbuff_dma[tx_q->cur_tx].map_as_page = false;
> +			tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = STMMAC_TXBUF_T_SKB;
> +		} else {
> +			curr_addr = skb_frag_dma_map(priv->device, addr, offset,
> +						     buff_size,
> +						     DMA_TO_DEVICE);
> +
> +			if (dma_mapping_error(priv->device, curr_addr))
> +				return -ENOMEM;
> +
> +			tx_q->tx_skbuff_dma[tx_q->cur_tx].buf = curr_addr;
> +			tx_q->tx_skbuff_dma[tx_q->cur_tx].len = buff_size;
> +			tx_q->tx_skbuff_dma[tx_q->cur_tx].map_as_page = true;
> +			tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = STMMAC_TXBUF_T_SKB;
> +		}
> +
>  		if (priv->dma_cap.addr64 <= 32)
>  			desc->des0 = cpu_to_le32(curr_addr);
>  		else
>  			stmmac_set_desc_addr(priv, desc, curr_addr);
>  
> -		buff_size = tmp_len >= TSO_MAX_BUFF_SIZE ?
> -			    TSO_MAX_BUFF_SIZE : tmp_len;
> -
>  		stmmac_prepare_tso_tx_desc(priv, desc, 0, buff_size,
>  				0, 1,
>  				(last_segment) && (tmp_len <= TSO_MAX_BUFF_SIZE),
> @@ -4182,6 +4210,7 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
>  
>  		tmp_len -= TSO_MAX_BUFF_SIZE;
>  	}
> +	return 0;

nit: add a newline before return 0

>  }
>  
>  static void stmmac_flush_tx_descriptors(struct stmmac_priv *priv, int queue)
> @@ -4351,25 +4380,17 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
>  		pay_len = 0;
>  	}
>  
> -	stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue);
> +	if (stmmac_tso_allocator(priv, (skb->data + proto_hdr_len),
> +				 tmp_pay_len, nfrags == 0, queue, false))
> +		goto dma_map_err;

Changing the second argument here is subtly changing the dma_cap.addr64 <= 32
case right before this. Is that intentional?

i.e., prior, pretend des = 0 (side note but des is a very confusing
variable name for "dma address" when there's also mentions of desc meaning
"descriptor" in the DMA ring). In the <= 32 case, we'd call
stmmac_tso_allocator(priv, 0) and in the else case we'd call
stmmac_tso_allocator(priv, 0 + proto_hdr_len).

With this change in both cases its called with the (not-yet-dma-mapped)
skb->data + proto_hdr_len always (i.e. like the else case).

Honestly, the <= 32 case reads weird to me without this patch. It
seems some of the buffer is filled but des is not properly incremented?

I don't know how this hardware is supposed to be programmed (no databook
access) but that seems fishy (and like a separate bug, which would be
nice to squash if so in its own patch). Would you be able to explain the
logic there to me if it does make sense to you?

>  
>  	/* Prepare fragments */
>  	for (i = 0; i < nfrags; i++) {
> -		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
> +		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
>  
> -		des = skb_frag_dma_map(priv->device, frag, 0,
> -				       skb_frag_size(frag),
> -				       DMA_TO_DEVICE);
> -		if (dma_mapping_error(priv->device, des))
> +		if (stmmac_tso_allocator(priv, frag, skb_frag_size(frag),
> +					 (i == nfrags - 1), queue, true))

Personally I think it would be nice to change stmmac_tso_allocator() so
you can keep the frag const above... i.e. something like
stmmac_tso_allocator(..., void *addr, ..., const skb_frag_t *frag)
and just check if frag is NULL to determine if you're dealing with a
frag or not (instead of passing the boolean in to indicate that).

I'm curious if someone else can think of a cleaner API than that for
that function, even that's not super pretty...
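
For example, something along these lines (untested sketch, just to show the
shape of the API I mean):

	static int stmmac_tso_allocator(struct stmmac_priv *priv, void *addr,
					int total_len, bool last_segment,
					u32 queue, const skb_frag_t *frag)
	{
		...
		/* frag == NULL means addr points at linear skb data */
		if (frag)
			curr_addr = skb_frag_dma_map(priv->device, frag, offset,
						     buff_size, DMA_TO_DEVICE);
		else
			curr_addr = dma_map_single(priv->device, addr + offset,
						   buff_size, DMA_TO_DEVICE);
		...
	}

	/* callers */
	stmmac_tso_allocator(priv, skb->data + proto_hdr_len, tmp_pay_len,
			     nfrags == 0, queue, NULL);
	stmmac_tso_allocator(priv, NULL, skb_frag_size(frag),
			     i == nfrags - 1, queue, frag);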

>  			goto dma_map_err;
> -
> -		stmmac_tso_allocator(priv, des, skb_frag_size(frag),
> -				     (i == nfrags - 1), queue);
> -
> -		tx_q->tx_skbuff_dma[tx_q->cur_tx].buf = des;
> -		tx_q->tx_skbuff_dma[tx_q->cur_tx].len = skb_frag_size(frag);
> -		tx_q->tx_skbuff_dma[tx_q->cur_tx].map_as_page = true;
> -		tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = STMMAC_TXBUF_T_SKB;
>  	}
>  
>  	tx_q->tx_skbuff_dma[tx_q->cur_tx].last_segment = true;
> -- 
> 2.25.1
>
Suraj Jaiswal Sept. 10, 2024, 12:47 p.m. UTC | #2
-----Original Message-----
From: Andrew Halaney <ahalaney@redhat.com> 
Sent: Wednesday, September 4, 2024 3:47 AM
To: Suraj Jaiswal (QUIC) <quic_jsuraj@quicinc.com>
Subject: Re: [PATCH net] net: stmmac: Stop using a single dma_map() for multiple descriptors

On Mon, Sep 02, 2024 at 03:24:36PM GMT, Suraj Jaiswal wrote:
> Currently same page address is shared
> between multiple buffer addresses and causing smmu fault for other 
> descriptor if address hold by one descriptor got cleaned.
> Allocate separate buffer address for each descriptor for TSO path so 
> that if one descriptor cleared it should not clean other descriptor 
> address.

I think maybe you mean something like:

    Currently in the TSO case a page is mapped with dma_map_single(), and then
    the resulting dma address is referenced (and offset) by multiple
    descriptors until the whole region is programmed into the descriptors.

    This makes it possible for stmmac_tx_clean() to dma_unmap() the first of the
    already processed descriptors, while the rest are still being processed
    by the DMA engine. This leads to an iommu fault due to the DMA engine using
    unmapped memory as seen below:

    <insert splat>

    You can reproduce this easily by <reproduction steps>.

    To fix this, let's map each descriptor's memory reference individually.
    This way there's no risk of unmapping a region that's still being
    referenced by the DMA engine in a later descriptor.

That's a bit nitpicky wording wise, but your first sentence is hard for me to follow (buffer addresses seems to mean descriptor?). I think showing a splat and mentioning how to reproduce is always a bonus as well.

>
> Signed-off-by: Suraj Jaiswal <quic_jsuraj@quicinc.com>

Fixes: ?

At a quick glance I think its f748be531d70 ("stmmac: support new GMAC4")

> ---
>
> Changes since v2:
> - Fixed function description
> - Fixed handling of return value.
>

This is v1 as far as netdev is concerned :)

>
>  .../net/ethernet/stmicro/stmmac/stmmac_main.c | 63 ++++++++++++-------
>  1 file changed, 42 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index 83b654b7a9fd..5948774c403f 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -4136,16 +4136,18 @@ static bool stmmac_vlan_insert(struct stmmac_priv *priv, struct sk_buff *skb,
>  /**
>   *  stmmac_tso_allocator - close entry point of the driver
>   *  @priv: driver private structure
> - *  @des: buffer start address
> + *  @addr: Contains either skb frag address or skb->data address
>   *  @total_len: total length to fill in descriptors
>   *  @last_segment: condition for the last descriptor
>   *  @queue: TX queue index
> + * @is_skb_frag: condition to check whether skb data is part of fragment or not
>   *  Description:
>   *  This function fills descriptor and request new descriptors according to
>   *  buffer length to fill
> + *  This function returns 0 on success else -ERRNO on fail
>   */
> -static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
> -                              int total_len, bool last_segment, u32 queue)
> +static int stmmac_tso_allocator(struct stmmac_priv *priv, void *addr,
> +                             int total_len, bool last_segment, u32 queue, bool is_skb_frag)
>  {
>       struct stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[queue];
>       struct dma_desc *desc;
> @@ -4153,6 +4155,8 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
>       int tmp_len;
>
>       tmp_len = total_len;
> +     unsigned int offset = 0;
> +     unsigned char *data = addr;

Reverse xmas tree order, offset is always set below so you could just declare it, and data really doesn't seem necessary to me vs using addr directly.
<Suraj> done.

https://docs.kernel.org/process/maintainer-netdev.html#local-variable-ordering-reverse-xmas-tree-rcs

>
>       while (tmp_len > 0) {
>               dma_addr_t curr_addr;
> @@ -4161,20 +4165,44 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
>                                               priv->dma_conf.dma_tx_size);
>               WARN_ON(tx_q->tx_skbuff[tx_q->cur_tx]);
>
> +             buff_size = tmp_len >= TSO_MAX_BUFF_SIZE ? TSO_MAX_BUFF_SIZE : tmp_len;
> +
>               if (tx_q->tbs & STMMAC_TBS_AVAIL)
>                       desc = &tx_q->dma_entx[tx_q->cur_tx].basic;
>               else
>                       desc = &tx_q->dma_tx[tx_q->cur_tx];
>
> -             curr_addr = des + (total_len - tmp_len);
> +             offset = total_len - tmp_len;
> +             if (!is_skb_frag) {
> +                     curr_addr = dma_map_single(priv->device, data + offset, buff_size,
> +                                                DMA_TO_DEVICE);

Instead of defining "data" above, can't you just use "addr" directly here?

> +
> +                     if (dma_mapping_error(priv->device, curr_addr))
> +                             return -ENOMEM;
> +
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].buf = curr_addr;
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].len = buff_size;
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].map_as_page = false;
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = STMMAC_TXBUF_T_SKB;
> +             } else {
> +                     curr_addr = skb_frag_dma_map(priv->device, addr, offset,
> +                                                  buff_size,
> +                                                  DMA_TO_DEVICE);
> +
> +                     if (dma_mapping_error(priv->device, curr_addr))
> +                             return -ENOMEM;
> +
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].buf = curr_addr;
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].len = buff_size;
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].map_as_page = true;
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = STMMAC_TXBUF_T_SKB;
> +             }
> +
>               if (priv->dma_cap.addr64 <= 32)
>                       desc->des0 = cpu_to_le32(curr_addr);
>               else
>                       stmmac_set_desc_addr(priv, desc, curr_addr);
>
> -             buff_size = tmp_len >= TSO_MAX_BUFF_SIZE ?
> -                         TSO_MAX_BUFF_SIZE : tmp_len;
> -
>               stmmac_prepare_tso_tx_desc(priv, desc, 0, buff_size,
>                               0, 1,
>                               (last_segment) && (tmp_len <= TSO_MAX_BUFF_SIZE),
> @@ -4182,6 +4210,7 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
>
>               tmp_len -= TSO_MAX_BUFF_SIZE;
>       }
> +     return 0;

nit: add a newline before return 0

>  }
>
>  static void stmmac_flush_tx_descriptors(struct stmmac_priv *priv, int queue)
> @@ -4351,25 +4380,17 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
>               pay_len = 0;
>       }
>
> -     stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue);
> +     if (stmmac_tso_allocator(priv, (skb->data + proto_hdr_len),
> +                              tmp_pay_len, nfrags == 0, queue, false))
> +             goto dma_map_err;

Changing the second argument here is subtly changing the dma_cap.addr64 <= 32 case right before this. Is that intentional?

i.e., prior, pretend des = 0 (side note but des is a very confusing variable name for "dma address" when there's also mentions of desc meaning "descriptor" in the DMA ring). In the <= 32 case, we'd call stmmac_tso_allocator(priv, 0) and in the else case we'd call stmmac_tso_allocator(priv, 0 + proto_hdr_len).

With this change in both cases its called with the (not-yet-dma-mapped)
skb->data + proto_hdr_len always (i.e. like the else case).

Honestly, the <= 32 case reads weird to me without this patch. It seems some of the buffer is filled but des is not properly incremented?

I don't know how this hardware is supposed to be programmed (no databook
access) but that seems fishy (and like a separate bug, which would be nice to squash if so in its own patch). Would you be able to explain the logic there to me if it does make sense to you?

<Suraj> des cannot be 0. des == 0 would mean dma_map_single() failed, and in that case the function returns early.
If we look at how des is used in the if case (first->des1 = cpu_to_le32(des + proto_hdr_len);) and in the else case (des += proto_hdr_len;), both add proto_hdr_len to the address obtained from mapping skb->data with dma_map_single(). We added proto_hdr_len to the second argument in the same way.

>
>       /* Prepare fragments */
>       for (i = 0; i < nfrags; i++) {
> -             const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
> +             skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
>
> -             des = skb_frag_dma_map(priv->device, frag, 0,
> -                                    skb_frag_size(frag),
> -                                    DMA_TO_DEVICE);
> -             if (dma_mapping_error(priv->device, des))
> +             if (stmmac_tso_allocator(priv, frag, skb_frag_size(frag),
> +                                      (i == nfrags - 1), queue, true))

Personally I think it would be nice to change stmmac_tso_allocator() so you can keep the frag const above... i.e. something like stmmac_tso_allocator(..., void *addr, ..., const skb_frag_t *frag) and just check if frag is NULL to determine if you're dealing with a frag or not (instead of passing the boolean in to indicate that).

I'm curious if someone else can think of a cleaner API than that for that function, even that's not super pretty...

>                       goto dma_map_err;
> -
> -             stmmac_tso_allocator(priv, des, skb_frag_size(frag),
> -                                  (i == nfrags - 1), queue);
> -
> -             tx_q->tx_skbuff_dma[tx_q->cur_tx].buf = des;
> -             tx_q->tx_skbuff_dma[tx_q->cur_tx].len = skb_frag_size(frag);
> -             tx_q->tx_skbuff_dma[tx_q->cur_tx].map_as_page = true;
> -             tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = STMMAC_TXBUF_T_SKB;
>       }
>
>       tx_q->tx_skbuff_dma[tx_q->cur_tx].last_segment = true;
> --
> 2.25.1
>
Andrew Halaney Sept. 10, 2024, 2:04 p.m. UTC | #3
Hey Suraj,

Your email client didn't seem to quote my response in your latest reply,
so its difficult to figure out what you're writing vs me below. It also
seems to have messed with the line breaks so I'm manually redoing those.

Please see if you can figure out how to make that happen for further
replies!

More comments below...

On Tue, Sep 10, 2024 at 12:47:08PM GMT, Suraj Jaiswal wrote:
> 
> 
> -----Original Message-----
> From: Andrew Halaney <ahalaney@redhat.com> 
> Sent: Wednesday, September 4, 2024 3:47 AM
> To: Suraj Jaiswal (QUIC) <quic_jsuraj@quicinc.com>
> Subject: Re: [PATCH net] net: stmmac: Stop using a single dma_map() for multiple descriptors
> 
> On Mon, Sep 02, 2024 at 03:24:36PM GMT, Suraj Jaiswal wrote:
> > Currently same page address is shared
> > between multiple buffer addresses and causing smmu fault for other 
> > descriptor if address hold by one descriptor got cleaned.
> > Allocate separate buffer address for each descriptor for TSO path so 
> > that if one descriptor cleared it should not clean other descriptor 
> > address.

snip...

> >
> >  static void stmmac_flush_tx_descriptors(struct stmmac_priv *priv, int queue)
> > @@ -4351,25 +4380,17 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
> >               pay_len = 0;
> >       }
> >
> > -     stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue);
> > +     if (stmmac_tso_allocator(priv, (skb->data + proto_hdr_len),
> > +                              tmp_pay_len, nfrags == 0, queue, false))
> > +             goto dma_map_err;
> 
> Changing the second argument here is subtly changing the dma_cap.addr64 <= 32
> case right before this. Is that intentional?
> 
> i.e., prior, pretend des = 0 (side note but des is a very confusing variable
> name for "dma address" when there's also mentions of desc meaning "descriptor"
> in the DMA ring). In the <= 32 case, we'd call stmmac_tso_allocator(priv, 0)
> and in the else case we'd call stmmac_tso_allocator(priv, 0 + proto_hdr_len).
> 
> With this change in both cases its called with the (not-yet-dma-mapped)
> skb->data + proto_hdr_len always (i.e. like the else case).
> 
> Honestly, the <= 32 case reads weird to me without this patch. It seems some
> of the buffer is filled but des is not properly incremented?
> 
> I don't know how this hardware is supposed to be programmed (no databook
> access) but that seems fishy (and like a separate bug, which would be nice to
> squash if so in its own patch). Would you be able to explain the logic there
> to me if it does make sense to you?
> 

> <Suraj> des cannot be 0. des == 0 would mean dma_map_single() failed, and in
> that case the function returns early. If we look at how des is used in the
> if case (first->des1 = cpu_to_le32(des + proto_hdr_len);) and in the else
> case (des += proto_hdr_len;), both add proto_hdr_len to the address obtained
> from mapping skb->data with dma_map_single(). We added proto_hdr_len to the
> second argument in the same way.


0 was just an example, and a confusing one, sorry. Let me paste the original
fishy code that I think you've modified the behavior for. Here's the
original:

	if (priv->dma_cap.addr64 <= 32) {
		first->des0 = cpu_to_le32(des);

		/* Fill start of payload in buff2 of first descriptor */
		if (pay_len)
			first->des1 = cpu_to_le32(des + proto_hdr_len);

		/* If needed take extra descriptors to fill the remaining payload */
		tmp_pay_len = pay_len - TSO_MAX_BUFF_SIZE;
	} else {
		stmmac_set_desc_addr(priv, first, des);
		tmp_pay_len = pay_len;
		des += proto_hdr_len;
		pay_len = 0;
	}

	stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue);

Imagine the <= 32 case. Let's say des is address 0 (just for simplicity
sake, let's assume that's valid). That means:

    first->des0 = des;
    first->des1 = des + proto_hdr_len;
    stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue)

    if des is 0, proto_hdr_len is 64, then that means

    first->des0 = 0
    first->des1 = 64
    stmmac_tso_allocator(priv, 0, tmp_pay_len, (nfrags == 0), queue)

That seems fishy to me. We setup up the first descriptor with the
beginning of des, and then the code goes and sets up more descriptors
(stmmac_tso_allocator()) starting with the same des again?

Should we be adding the payload length (TSO_MAX_BUFF_SIZE I suppose
based on the tmp_pay_len = pay_len - TSO_MAX_BUFFSIZE above)? It seems
that <= 32 results in duplicate data in both the "first" descriptor
programmed above, and in the "second" descriptor programmed in
stmmac_tso_allocator(). Also, since tmp_pay_len is decremented, but des
isn't, it seems that stmmac_tso_allocator() would not put all of the
buffer in the descriptors and would leave the last TSO_MAX_BUFF_SIZE
bytes out?
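
To make that concrete with made-up numbers: if pay_len were
2 * TSO_MAX_BUFF_SIZE, then as I read it the <= 32 branch programs

	first->des0 = des
	first->des1 = des + proto_hdr_len
	tmp_pay_len = pay_len - TSO_MAX_BUFF_SIZE	/* = TSO_MAX_BUFF_SIZE */

and the first loop iteration of stmmac_tso_allocator() computes

	curr_addr = des + (total_len - tmp_len) = des + 0 = des

so the next descriptor points at des again, and since only tmp_pay_len
(not pay_len) goes through the loop, the last TSO_MAX_BUFF_SIZE bytes are
never programmed into any descriptor.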

I highlight all of this because with your change here we get the
following now in the <= 32 case:

    first->des0 = des
    first->des1 = des + proto_hdr_len
    stmmac_tso_allocator(priv, des + proto_hdr_len, ...)

which is a subtle change in the call to stmmac_tso_allocator, meaning
a subtle change in the descriptor programming.

Both seem wrong for the <= 32 case, but I'm "reading between the lines"
with how these descriptors are programmed (I don't have the docs to back
this up, I'm inferring from the code). It seems to me that in the <= 32
case we should have:

    first->des0 = des
    first->des1 = des + proto_hdr_len
    stmmac_tso_allocator(priv, des + TSO_MAX_BUF_SIZE, ...)

or similar depending on if that really makes sense with how des0/des1 is
used (the handling is different in stmmac_tso_allocator() for <= 32,
only des0 is used so I'm having a tough time figuring out how much of
the des is actually programmed in des0 + des1 above without knowing the
hardware better).

Does that make sense? The prior code seems fishy to me, your change
seems to unintentionally change that fhsy part, but it still seems fishy
to me. I don't think you should be changing that code's behavior in that
patch, if you think it's right then we should continue with the current
behavior prior to your patch, and if you think its wrong we should
probably fix that *prior* to this patch in your series.

Thanks,
Andrew
Suraj Jaiswal Sept. 10, 2024, 3:43 p.m. UTC | #4
-----Original Message-----
From: Andrew Halaney <ahalaney@redhat.com> 
Sent: Wednesday, September 4, 2024 3:47 AM
To: Suraj Jaiswal (QUIC) <quic_jsuraj@quicinc.com>
Subject: Re: [PATCH net] net: stmmac: Stop using a single dma_map() for multiple descriptors

On Mon, Sep 02, 2024 at 03:24:36PM GMT, Suraj Jaiswal wrote:
> Currently same page address is shared
> between multiple buffer addresses and causing smmu fault for other 
> descriptor if address hold by one descriptor got cleaned.
> Allocate separate buffer address for each descriptor for TSO path so 
> that if one descriptor cleared it should not clean other descriptor 
> address.

I think maybe you mean something like:

    Currently in the TSO case a page is mapped with dma_map_single(), and then
    the resulting dma address is referenced (and offset) by multiple
    descriptors until the whole region is programmed into the descriptors.

    This makes it possible for stmmac_tx_clean() to dma_unmap() the first of the
    already processed descriptors, while the rest are still being processed
    by the DMA engine. This leads to an iommu fault due to the DMA engine using
    unmapped memory as seen below:

    <insert splat>

    You can reproduce this easily by <reproduction steps>.

    To fix this, let's map each descriptor's memory reference individually.
    This way there's no risk of unmapping a region that's still being
    referenced by the DMA engine in a later descriptor.

That's a bit nitpicky wording wise, but your first sentence is hard for me to follow (buffer addresses seems to mean descriptor?). I think showing a splat and mentioning how to reproduce is always a bonus as well.

>
> Signed-off-by: Suraj Jaiswal <quic_jsuraj@quicinc.com>

Fixes: ?

At a quick glance I think its f748be531d70 ("stmmac: support new GMAC4")

> ---
>
> Changes since v2:
> - Fixed function description
> - Fixed handling of return value.
>

This is v1 as far as netdev is concerned :)

>
>  .../net/ethernet/stmicro/stmmac/stmmac_main.c | 63 ++++++++++++-------
>  1 file changed, 42 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index 83b654b7a9fd..5948774c403f 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -4136,16 +4136,18 @@ static bool stmmac_vlan_insert(struct stmmac_priv *priv, struct sk_buff *skb,
>  /**
>   *  stmmac_tso_allocator - close entry point of the driver
>   *  @priv: driver private structure
> - *  @des: buffer start address
> + *  @addr: Contains either skb frag address or skb->data address
>   *  @total_len: total length to fill in descriptors
>   *  @last_segment: condition for the last descriptor
>   *  @queue: TX queue index
> + * @is_skb_frag: condition to check whether skb data is part of fragment or not
>   *  Description:
>   *  This function fills descriptor and request new descriptors according to
>   *  buffer length to fill
> + *  This function returns 0 on success else -ERRNO on fail
>   */
> -static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
> -                              int total_len, bool last_segment, u32 queue)
> +static int stmmac_tso_allocator(struct stmmac_priv *priv, void *addr,
> +                             int total_len, bool last_segment, u32 queue, bool is_skb_frag)
>  {
>       struct stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[queue];
>       struct dma_desc *desc;
> @@ -4153,6 +4155,8 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
>       int tmp_len;
>
>       tmp_len = total_len;
> +     unsigned int offset = 0;
> +     unsigned char *data = addr;

Reverse xmas tree order, offset is always set below so you could just declare it, and data really doesn't seem necessary to me vs using addr directly.

https://docs.kernel.org/process/maintainer-netdev.html#local-variable-ordering-reverse-xmas-tree-rcs

>
>       while (tmp_len > 0) {
>               dma_addr_t curr_addr;
> @@ -4161,20 +4165,44 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
>                                               priv->dma_conf.dma_tx_size);
>               WARN_ON(tx_q->tx_skbuff[tx_q->cur_tx]);
>
> +             buff_size = tmp_len >= TSO_MAX_BUFF_SIZE ? TSO_MAX_BUFF_SIZE : tmp_len;
> +
>               if (tx_q->tbs & STMMAC_TBS_AVAIL)
>                       desc = &tx_q->dma_entx[tx_q->cur_tx].basic;
>               else
>                       desc = &tx_q->dma_tx[tx_q->cur_tx];
>
> -             curr_addr = des + (total_len - tmp_len);
> +             offset = total_len - tmp_len;
> +             if (!is_skb_frag) {
> +                     curr_addr = dma_map_single(priv->device, data + offset, buff_size,
> +                                                DMA_TO_DEVICE);

Instead of defining "data" above, can't you just use "addr" directly here?
<suraj> addr is a void pointer; we use data to convert it to a char *.

> +
> +                     if (dma_mapping_error(priv->device, curr_addr))
> +                             return -ENOMEM;
> +
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].buf = curr_addr;
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].len = buff_size;
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].map_as_page = false;
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = STMMAC_TXBUF_T_SKB;
> +             } else {
> +                     curr_addr = skb_frag_dma_map(priv->device, addr, offset,
> +                                                  buff_size,
> +                                                  DMA_TO_DEVICE);
> +
> +                     if (dma_mapping_error(priv->device, curr_addr))
> +                             return -ENOMEM;
> +
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].buf = curr_addr;
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].len = buff_size;
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].map_as_page = true;
> +                     tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = STMMAC_TXBUF_T_SKB;
> +             }
> +
>               if (priv->dma_cap.addr64 <= 32)
>                       desc->des0 = cpu_to_le32(curr_addr);
>               else
>                       stmmac_set_desc_addr(priv, desc, curr_addr);
>
> -             buff_size = tmp_len >= TSO_MAX_BUFF_SIZE ?
> -                         TSO_MAX_BUFF_SIZE : tmp_len;
> -
>               stmmac_prepare_tso_tx_desc(priv, desc, 0, buff_size,
>                               0, 1,
>                               (last_segment) && (tmp_len <= TSO_MAX_BUFF_SIZE),
> @@ -4182,6 +4210,7 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
>
>               tmp_len -= TSO_MAX_BUFF_SIZE;
>       }
> +     return 0;

nit: add a newline before return 0

>  }
>
>  static void stmmac_flush_tx_descriptors(struct stmmac_priv *priv, int queue)
> @@ -4351,25 +4380,17 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
>               pay_len = 0;
>       }
>
> -     stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue);
> +     if (stmmac_tso_allocator(priv, (skb->data + proto_hdr_len),
> +                              tmp_pay_len, nfrags == 0, queue, false))
> +             goto dma_map_err;

Changing the second argument here is subtly changing the dma_cap.addr64 <= 32 case right before this. Is that intentional?

i.e., prior, pretend des = 0 (side note but des is a very confusing variable name for "dma address" when there's also mentions of desc meaning "descriptor" in the DMA ring). In the <= 32 case, we'd call stmmac_tso_allocator(priv, 0) and in the else case we'd call stmmac_tso_allocator(priv, 0 + proto_hdr_len).

With this change in both cases its called with the (not-yet-dma-mapped)
skb->data + proto_hdr_len always (i.e. like the else case).

Honestly, the <= 32 case reads weird to me without this patch. It seems some of the buffer is filled but des is not properly incremented?

I don't know how this hardware is supposed to be programmed (no databook
access) but that seems fishy (and like a separate bug, which would be nice to squash if so in its own patch). Would you be able to explain the logic there to me if it does make sense to you?

>
>       /* Prepare fragments */
>       for (i = 0; i < nfrags; i++) {
> -             const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
> +             skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
>
> -             des = skb_frag_dma_map(priv->device, frag, 0,
> -                                    skb_frag_size(frag),
> -                                    DMA_TO_DEVICE);
> -             if (dma_mapping_error(priv->device, des))
> +             if (stmmac_tso_allocator(priv, frag, skb_frag_size(frag),
> +                                      (i == nfrags - 1), queue, true))

Personally I think it would be nice to change stmmac_tso_allocator() so you can keep the frag const above... i.e. something like stmmac_tso_allocator(..., void *addr, ..., const skb_frag_t *frag) and just check if frag is NULL to determine if you're dealing with a frag or not (instead of passing the boolean in to indicate that).

I'm curious if someone else can think of a cleaner API than that for that function, even that's not super pretty...

>                       goto dma_map_err;
> -
> -             stmmac_tso_allocator(priv, des, skb_frag_size(frag),
> -                                  (i == nfrags - 1), queue);
> -
> -             tx_q->tx_skbuff_dma[tx_q->cur_tx].buf = des;
> -             tx_q->tx_skbuff_dma[tx_q->cur_tx].len = skb_frag_size(frag);
> -             tx_q->tx_skbuff_dma[tx_q->cur_tx].map_as_page = true;
> -             tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = STMMAC_TXBUF_T_SKB;
>       }
>
>       tx_q->tx_skbuff_dma[tx_q->cur_tx].last_segment = true;
> --
> 2.25.1
>
Andrew Lunn Sept. 10, 2024, 4:04 p.m. UTC | #5
> On Mon, Sep 02, 2024 at 03:24:36PM GMT, Suraj Jaiswal wrote:
> > Currently same page address is shared
> > between multiple buffer addresses and causing smmu fault for other 
> > descriptor if address hold by one descriptor got cleaned.
> > Allocate separate buffer address for each descriptor for TSO path so 
> > that if one descriptor cleared it should not clean other descriptor 
> > address.
> 
> I think maybe you mean something like:
> 
>     Currently in the TSO case a page is mapped with dma_map_single(), and then
>     the resulting dma address is referenced (and offset) by multiple
>     descriptors until the whole region is programmed into the descriptors.
> 
>     This makes it possible for stmmac_tx_clean() to dma_unmap() the first of the
>     already processed descriptors, while the rest are still being processed
>     by the DMA engine. This leads to an iommu fault due to the DMA engine using
>     unmapped memory as seen below:
> 
>     <insert splat>
> 
>     You can reproduce this easily by <reproduction steps>.
> 
>     To fix this, let's map each descriptor's memory reference individually.
>     This way there's no risk of unmapping a region that's still being
>     referenced by the DMA engine in a later descriptor.
> 
> That's a bit nitpicky wording wise, but your first sentence is hard for me to follow (buffer addresses seems to mean descriptor?). I think showing a splat and mentioning how to reproduce is always a bonus as well.

What might also be interesting is some benchmark
numbers. dma_map_single() is not always cheap, since it has to flush
the cache. I've no idea if only the first call is expensive, and all
the subsequent calls are cheap? Depending on how expensive it is, some
sort of refcounting might be cheaper?
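
Something like this, perhaps (hand-waving sketch; map_refs is a made-up
field, not something the driver has today):

	/* map the payload once, count the descriptors referencing it */
	des = dma_map_single(priv->device, skb->data, len, DMA_TO_DEVICE);
	refcount_set(&tx_q->map_refs, n_desc);

	/* in stmmac_tx_clean(), per completed descriptor */
	if (refcount_dec_and_test(&tx_q->map_refs))
		dma_unmap_single(priv->device, des, len, DMA_TO_DEVICE);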

     Andrew
Dmitry Baryshkov Sept. 11, 2024, 8:55 a.m. UTC | #6
On Tue, Sep 10, 2024 at 03:43:23PM GMT, Suraj Jaiswal wrote:
> 
> 
> -----Original Message-----
> From: Andrew Halaney <ahalaney@redhat.com> 
> Sent: Wednesday, September 4, 2024 3:47 AM
> To: Suraj Jaiswal (QUIC) <quic_jsuraj@quicinc.com>
> Subject: Re: [PATCH net] net: stmmac: Stop using a single dma_map() for multiple descriptors
> 
> On Mon, Sep 02, 2024 at 03:24:36PM GMT, Suraj Jaiswal wrote:
> > Currently same page address is shared
> > between multiple buffer addresses and causing smmu fault for other 
> > descriptor if address hold by one descriptor got cleaned.
> > Allocate separate buffer address for each descriptor for TSO path so 
> > that if one descriptor cleared it should not clean other descriptor 
> > address.
> 
> I think maybe you mean something like:
> 
>     Currently in the TSO case a page is mapped with dma_map_single(), and then
>     the resulting dma address is referenced (and offset) by multiple
>     descriptors until the whole region is programmed into the descriptors.
> 
>     This makes it possible for stmmac_tx_clean() to dma_unmap() the first of the
>     already processed descriptors, while the rest are still being processed
>     by the DMA engine. This leads to an iommu fault due to the DMA engine using
>     unmapped memory as seen below:
> 
>     <insert splat>
> 
>     You can reproduce this easily by <reproduction steps>.
> 
>     To fix this, let's map each descriptor's memory reference individually.
>     This way there's no risk of unmapping a region that's still being
>     referenced by the DMA engine in a later descriptor.
> 
> That's a bit nitpicky wording wise, but your first sentence is hard for me to follow (buffer addresses seems to mean descriptor?). I think showing a splat and mentioning how to reproduce is always a bonus as well.

Please fix your email client. It is impossible to tell where your answer is
and where the text quoted from Andrew begins. Use text emails, text quotation
(a single '>'), and no Outlook splat at the top of the email ("Original
Message" with all the email addresses, etc.).
Sarosh Hasan Sept. 24, 2024, 11:06 a.m. UTC | #7
On 9/10/2024 7:34 PM, Andrew Halaney wrote:
> Hey Suraj,
> 
> Your email client didn't seem to quote my response in your latest reply,
> so its difficult to figure out what you're writing vs me below. It also
> seems to have messed with the line breaks so I'm manually redoing those.
> 
> Please see if you can figure out how to make that happen for further
> replies!
> 
> More comments below...
> 
> On Tue, Sep 10, 2024 at 12:47:08PM GMT, Suraj Jaiswal wrote:
>>
>>
>> -----Original Message-----
>> From: Andrew Halaney <ahalaney@redhat.com> 
>> Sent: Wednesday, September 4, 2024 3:47 AM
>> To: Suraj Jaiswal (QUIC) <quic_jsuraj@quicinc.com>
>> Subject: Re: [PATCH net] net: stmmac: Stop using a single dma_map() for multiple descriptors
>>
>> On Mon, Sep 02, 2024 at 03:24:36PM GMT, Suraj Jaiswal wrote:
>>> Currently same page address is shared
>>> between multiple buffer addresses and causing smmu fault for other 
>>> descriptor if address hold by one descriptor got cleaned.
>>> Allocate separate buffer address for each descriptor for TSO path so 
>>> that if one descriptor cleared it should not clean other descriptor 
>>> address.
> 
> snip...
> 
>>>
>>>  static void stmmac_flush_tx_descriptors(struct stmmac_priv *priv, int 
>>> queue) @@ -4351,25 +4380,17 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
>>>               pay_len = 0;
>>>       }
>>>
>>> -     stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue);
>>> +     if (stmmac_tso_allocator(priv, (skb->data + proto_hdr_len),
>>> +                              tmp_pay_len, nfrags == 0, queue, false))
>>> +             goto dma_map_err;
>>
>> Changing the second argument here is subtly changing the dma_cap.addr64 <= 32
>> case right before this. Is that intentional?
>>
>> i.e., prior, pretend des = 0 (side note but des is a very confusing variable
>> name for "dma address" when there's also mentions of desc meaning "descriptor"
>> in the DMA ring). In the <= 32 case, we'd call stmmac_tso_allocator(priv, 0)
>> and in the else case we'd call stmmac_tso_allocator(priv, 0 + proto_hdr_len).
>>
>> With this change in both cases its called with the (not-yet-dma-mapped)
>> skb->data + proto_hdr_len always (i.e. like the else case).
>>
>> Honestly, the <= 32 case reads weird to me without this patch. It seems some
>> of the buffer is filled but des is not properly incremented?
>>
>> I don't know how this hardware is supposed to be programmed (no databook
>> access) but that seems fishy (and like a separate bug, which would be nice to
>> squash if so in its own patch). Would you be able to explain the logic there
>> to me if it does make sense to you?
>>
> 
>> <Suraj> des can not be 0 . des 0 means dma_map_single() failed and it will return.
>> If we see if des calculation (first->des1 = cpu_to_le32(des + proto_hdr_len);)
>> and else case des calculator ( des += proto_hdr_len;) it is adding proto_hdr_len
>> to the memory that we after mapping skb->data using dma_map_single.
>> Same way we added proto_hdr_len in second argument . 
> 
> 
> 0 was just an example, and a confusing one, sorry. Let me paste the original
> fishy code that I think you've modified the behavior for. Here's the
> original:
> 
> 	if (priv->dma_cap.addr64 <= 32) {
> 		first->des0 = cpu_to_le32(des);
> 
> 		/* Fill start of payload in buff2 of first descriptor */
> 		if (pay_len)
> 			first->des1 = cpu_to_le32(des + proto_hdr_len);
> 
> 		/* If needed take extra descriptors to fill the remaining payload */
> 		tmp_pay_len = pay_len - TSO_MAX_BUFF_SIZE;
> 	} else {
> 		stmmac_set_desc_addr(priv, first, des);
> 		tmp_pay_len = pay_len;
> 		des += proto_hdr_len;
> 		pay_len = 0;
> 	}
> 
> 	stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue);
> 
> Imagine the <= 32 case. Let's say des is address 0 (just for simplicity
> sake, let's assume that's valid). That means:
> 
>     first->des0 = des;
>     first->des1 = des + proto_hdr_len;
>     stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue)
> 
>     if des is 0, proto_hdr_len is 64, then that means
> 
>     first->des0 = 0
>     first->des1 = 64
>     stmmac_tso_allocator(priv, 0, tmp_pay_len, (nfrags == 0), queue)
> 
> That seems fishy to me. We setup up the first descriptor with the
> beginning of des, and then the code goes and sets up more descriptors
> (stmmac_tso_allocator()) starting with the same des again?
stmmac_tso_allocator() checks whether more descriptors are needed for the
packet. It adds an offset to get the buffer address for the next descriptor
(curr_addr = des + (total_len - tmp_len)) and stores that in the next
descriptor's des field.
> 
> Should we be adding the payload length (TSO_MAX_BUFF_SIZE I suppose
> based on the tmp_pay_len = pay_len - TSO_MAX_BUFFSIZE above)? It seems
> that <= 32 results in duplicate data in both the "first" descriptor
> programmed above, and in the "second" descriptor programmed in
> stmmac_tso_allocator().
curr_addr = des + (total_len - tmp_len) is used in the while loop in
stmmac_tso_allocator() to get the address for each required descriptor.
The descriptor address is finally updated in stmmac_tso_allocator() by the
call below:

	if (priv->dma_cap.addr64 <= 32)
		desc->des0 = cpu_to_le32(curr_addr);
	else
		stmmac_set_desc_addr(priv, desc, curr_addr);

> Also, since tmp_pay_len is decremented, but des
> isn't, it seems that stmmac_tso_allocator() would not put all of the
> buffer in the descriptors and would leave the last TSO_MAX_BUFF_SIZE
> bytes out?
> 
> I highlight all of this because with your change here we get the
> following now in the <= 32 case:
> 
>     first->des0 = des
>     first->des1 = des + proto_hdr_len
>     stmmac_tso_allocator(priv, des + proto_hdr_len, ...)
> 
> which is a subtle change in the call to stmmac_tso_allocator, meaning
> a subtle change in the descriptor programming.
> 
> Both seem wrong for the <= 32 case, but I'm "reading between the lines"
> with how these descriptors are programmed (I don't have the docs to back
> this up, I'm inferring from the code). It seems to me that in the <= 32
> case we should have:
> 
>     first->des0 = des
>     first->des1 = des + proto_hdr_len
>     stmmac_tso_allocator(priv, des + TSO_MAX_BUF_SIZE, ...)

Let me check the <= 32 case on the setup and get back to you.
> 
> or similar depending on if that really makes sense with how des0/des1 is
> used (the handling is different in stmmac_tso_allocator() for <= 32,
> only des0 is used so I'm having a tough time figuring out how much of
> the des is actually programmed in des0 + des1 above without knowing the
> hardware better).
> 
> Does that make sense? The prior code seems fishy to me, your change
> seems to unintentionally change that fhsy part, but it still seems fishy
> to me. I don't think you should be changing that code's behavior in that
> patch, if you think it's right then we should continue with the current
> behavior prior to your patch, and if you think its wrong we should
> probably fix that *prior* to this patch in your series.
> 
> Thanks,
> Andrew
>
Andrew Halaney Sept. 24, 2024, 4:36 p.m. UTC | #8
On Tue, Sep 24, 2024 at 04:36:59PM GMT, Sarosh Hasan wrote:
> 
> 
> On 9/10/2024 7:34 PM, Andrew Halaney wrote:
> > Hey Suraj,
> > 
> > Your email client didn't seem to quote my response in your latest reply,
> > so its difficult to figure out what you're writing vs me below. It also
> > seems to have messed with the line breaks so I'm manually redoing those.
> > 
> > Please see if you can figure out how to make that happen for further
> > replies!
> > 
> > More comments below...
> > 
> > On Tue, Sep 10, 2024 at 12:47:08PM GMT, Suraj Jaiswal wrote:
> >>
> >>
> >> -----Original Message-----
> >> From: Andrew Halaney <ahalaney@redhat.com> 
> >> Sent: Wednesday, September 4, 2024 3:47 AM
> >> To: Suraj Jaiswal (QUIC) <quic_jsuraj@quicinc.com>
> >> Subject: Re: [PATCH net] net: stmmac: Stop using a single dma_map() for multiple descriptors
> >>
> >> On Mon, Sep 02, 2024 at 03:24:36PM GMT, Suraj Jaiswal wrote:
> >>> Currently same page address is shared
> >>> between multiple buffer addresses and causing smmu fault for other 
> >>> descriptor if address hold by one descriptor got cleaned.
> >>> Allocate separate buffer address for each descriptor for TSO path so 
> >>> that if one descriptor cleared it should not clean other descriptor 
> >>> address.
> > 
> > snip...
> > 
> >>>
> >>>  static void stmmac_flush_tx_descriptors(struct stmmac_priv *priv, int queue)
> >>> @@ -4351,25 +4380,17 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
> >>>               pay_len = 0;
> >>>       }
> >>>
> >>> -     stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue);
> >>> +     if (stmmac_tso_allocator(priv, (skb->data + proto_hdr_len),
> >>> +                              tmp_pay_len, nfrags == 0, queue, false))
> >>> +             goto dma_map_err;
> >>
> >> Changing the second argument here is subtly changing the dma_cap.addr64 <= 32
> >> case right before this. Is that intentional?
> >>
> >> i.e., prior, pretend des = 0 (side note but des is a very confusing variable
> >> name for "dma address" when there's also mentions of desc meaning "descriptor"
> >> in the DMA ring). In the <= 32 case, we'd call stmmac_tso_allocator(priv, 0)
> >> and in the else case we'd call stmmac_tso_allocator(priv, 0 + proto_hdr_len).
> >>
> >> With this change in both cases its called with the (not-yet-dma-mapped)
> >> skb->data + proto_hdr_len always (i.e. like the else case).
> >>
> >> Honestly, the <= 32 case reads weird to me without this patch. It seems some
> >> of the buffer is filled but des is not properly incremented?
> >>
> >> I don't know how this hardware is supposed to be programmed (no databook
> >> access) but that seems fishy (and like a separate bug, which would be nice to
> >> squash if so in its own patch). Would you be able to explain the logic there
> >> to me if it does make sense to you?
> >>
> > 
> >> <Suraj> des cannot be 0. des == 0 would mean dma_map_single() failed, and
> >> in that case the function returns early. If we look at how des is used in
> >> the if case (first->des1 = cpu_to_le32(des + proto_hdr_len);) and in the
> >> else case (des += proto_hdr_len;), both add proto_hdr_len to the address
> >> obtained from mapping skb->data with dma_map_single(). We added
> >> proto_hdr_len to the second argument in the same way.
> > 
> > 
> > 0 was just an example, and a confusing one, sorry. Let me paste the original
> > fishy code that I think you've modified the behavior for. Here's the
> > original:
> > 
> > 	if (priv->dma_cap.addr64 <= 32) {
> > 		first->des0 = cpu_to_le32(des);
> > 
> > 		/* Fill start of payload in buff2 of first descriptor */
> > 		if (pay_len)
> > 			first->des1 = cpu_to_le32(des + proto_hdr_len);
> > 
> > 		/* If needed take extra descriptors to fill the remaining payload */
> > 		tmp_pay_len = pay_len - TSO_MAX_BUFF_SIZE;
> > 	} else {
> > 		stmmac_set_desc_addr(priv, first, des);
> > 		tmp_pay_len = pay_len;
> > 		des += proto_hdr_len;
> > 		pay_len = 0;
> > 	}
> > 
> > 	stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue);
> > 
> > Imagine the <= 32 case. Let's say des is address 0 (just for simplicity
> > sake, let's assume that's valid). That means:
> > 
> >     first->des0 = des;
> >     first->des1 = des + proto_hdr_len;
> >     stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue)
> > 
> >     if des is 0, proto_hdr_len is 64, then that means
> > 
> >     first->des0 = 0
> >     first->des1 = 64
> >     stmmac_tso_allocator(priv, 0, tmp_pay_len, (nfrags == 0), queue)
> > 
> > That seems fishy to me. We setup up the first descriptor with the
> > beginning of des, and then the code goes and sets up more descriptors
> > (stmmac_tso_allocator()) starting with the same des again?
> tso_alloc checks whether more descriptors are needed for the packet. It adds an offset to get the next
> descriptor's address (curr_addr = des + (total_len - tmp_len)) and stores that in the next descriptor's des.

Yes, so in stmmac_tso_allocator() we currently have:

	static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
					 int total_len, bool last_segment, u32 queue)
	{
		struct stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[queue];
		struct dma_desc *desc;
		u32 buff_size;
		int tmp_len;

		tmp_len = total_len;

		while (tmp_len > 0) {
			dma_addr_t curr_addr;

			tx_q->cur_tx = STMMAC_GET_ENTRY(tx_q->cur_tx,
							priv->dma_conf.dma_tx_size);
			...
			curr_addr = des + (total_len - tmp_len);
			if (priv->dma_cap.addr64 <= 32)
				desc->des0 = cpu_to_le32(curr_addr);

so on the first loop you've got:
	tmp_len = total_len
	...
> 	curr_addr = des + (total_len - tmp_len)
	i.e.
	curr_addr = des
meaning with the "first" handling I've highlighted we've got
	first->des0 = des
	"next"->des0 = des

where "next" is the cur_tx descriptor in the first loop of
stmmac_tso_allocator() (essentially the second descriptor).
That seems broken to me, and was that way prior to this patch.
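
To make that concrete, here is a minimal userspace sketch of the pre-patch
address math for the addr64 <= 32 path (des, proto_hdr_len, and pay_len are
hypothetical values, not taken from the driver):

#include <stdio.h>

#define TSO_MAX_BUFF_SIZE 16384

int main(void)
{
	/* hypothetical values, for illustration only */
	unsigned long des = 0x1000;        /* pretend dma_map_single() result */
	unsigned long proto_hdr_len = 64;
	unsigned long pay_len = 40000;

	/* "first" descriptor as programmed in stmmac_tso_xmit(), <= 32 case */
	unsigned long first_des0 = des;
	unsigned long first_des1 = des + proto_hdr_len;
	unsigned long tmp_pay_len = pay_len - TSO_MAX_BUFF_SIZE;

	/* first iteration of the stmmac_tso_allocator(priv, des, ...) loop */
	unsigned long total_len = tmp_pay_len;
	unsigned long tmp_len = total_len;
	unsigned long next_des0 = des + (total_len - tmp_len); /* == des */

	printf("first->des0 = %#lx\n", first_des0);
	printf("first->des1 = %#lx\n", first_des1);
	printf("next->des0  = %#lx (duplicate of first->des0)\n", next_des0);
	return 0;
}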

You've modified the behavior in this patch unintentionally. I think it
needs modifying, but it should be done so explicitly in its own patch
prior to this one. I also think the current modification in this patch
isn't a fix. See prior reply below where I highlighted the programming as I
understand it with this patch applied, which would result in something
like.

first->des0 = des
first->des1 = des + proto_hdr_len
"next"->des0 = des + proto_hdr_len

Which again seems wrong, two descriptors pointing to the same address
isn't making sense to me.
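
And a corresponding sketch of the with-patch programming (the offsets
relative to skb->data are hypothetical, not taken from the driver):

#include <stdio.h>

int main(void)
{
	unsigned long proto_hdr_len = 64;	/* hypothetical header length */

	unsigned long first_des0 = 0;			/* skb->data + 0  */
	unsigned long first_des1 = proto_hdr_len;	/* skb->data + 64 */
	unsigned long next_des0  = proto_hdr_len;	/* allocator's first chunk */

	printf("first->des0 -> skb->data + %lu\n", first_des0);
	printf("first->des1 -> skb->data + %lu\n", first_des1);
	printf("next->des0  -> skb->data + %lu (same bytes as first->des1)\n",
	       next_des0);
	return 0;
}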

Sorry to sound like a broken record, but I want to make sure we're on
the same page! Sounds like you're looking into it based on the below
comment, but some of these comments here made me think I didn't explain
the situation well enough.

> > 
> > Should we be adding the payload length (TSO_MAX_BUFF_SIZE I suppose
> > based on the tmp_pay_len = pay_len - TSO_MAX_BUFF_SIZE above)? It seems
> > that <= 32 results in duplicate data in both the "first" descriptor
> > programmed above, and in the "second" descriptor programmed in
> > stmmac_tso_allocator().
> curr_addr = des + (total_len - tmp_len) is used in the while loop in tso_alloc to get the address for
> each required descriptor. The descriptor address is finally written in tso_alloc by the call below:
> 
> 	if (priv->dma_cap.addr64 <= 32)
> 		desc->des0 = cpu_to_le32(curr_addr);
> 	else
> 		stmmac_set_desc_addr(priv, desc, curr_addr);
> 
> > Also, since tmp_pay_len is decremented, but des
> > isn't, it seems that stmmac_tso_allocator() would not put all of the
> > buffer in the descriptors and would leave the last TSO_MAX_BUFF_SIZE
> > bytes out?
> > 
> > I highlight all of this because with your change here we get the
> > following now in the <= 32 case:
> > 
> >     first->des0 = des
> >     first->des1 = des + proto_hdr_len
> >     stmmac_tso_allocator(priv, des + proto_hdr_len, ...)
> > 
> > which is a subtle change in the call to stmmac_tso_allocator, meaning
> > a subtle change in the descriptor programming.
> > 
> > Both seem wrong for the <= 32 case, but I'm "reading between the lines"
> > with how these descriptors are programmed (I don't have the docs to back
> > this up, I'm inferring from the code). It seems to me that in the <= 32
> > case we should have:
> > 
> >     first->des0 = des
> >     first->des1 = des + proto_hdr_len
> >     stmmac_tso_allocator(priv, des + TSO_MAX_BUFF_SIZE, ...)
> 
> let me check <=32 case only on setup and get back.
> > 
> > or similar depending on if that really makes sense with how des0/des1 is
> > used (the handling is different in stmmac_tso_allocator() for <= 32,
> > only des0 is used so I'm having a tough time figuring out how much of
> > the des is actually programmed in des0 + des1 above without knowing the
> > hardware better).
> > 
> > Does that make sense? The prior code seems fishy to me, your change
> > seems to unintentionally change that fishy part, but it still seems fishy
> > to me. I don't think you should be changing that code's behavior in that
> > patch, if you think it's right then we should continue with the current
> > behavior prior to your patch, and if you think its wrong we should
> > probably fix that *prior* to this patch in your series.
> > 
> > Thanks,
> > Andrew
> > 
>
Sarosh Hasan Oct. 8, 2024, 12:15 p.m. UTC | #9
On 9/24/2024 10:06 PM, Andrew Halaney wrote:
> On Tue, Sep 24, 2024 at 04:36:59PM GMT, Sarosh Hasan wrote:
>> On 9/10/2024 7:34 PM, Andrew Halaney wrote:
> 
> snip...
> 
> so on the first loop you've got:
> 	tmp_len = total_len
> 	...
> 	curr_addr = des + (total_len - tmp_len)
> 	i.e.
> 	curr_addr = des
> meaning with the "first" handling I've highlighted we've got
> 	first->des0 = des
> 	"next"->des0 = des
> 
> where "next" is the cur_tx descriptor in the first loop of
> stmmac_tso_allocator() (essentially the second descriptor).
> That seems broken to me, and was that way prior to this patch.
> 
We were able to verify this for both DMA mask <= 32 and > 32. We will update the final patch, taking care of the other comments.
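
For reference, here is a minimal userspace sketch (with a hypothetical
total_len) of the per-descriptor chunking that the patched
stmmac_tso_allocator() below performs; because each chunk gets its own
dma_map_single()/skb_frag_dma_map(), cleaning one descriptor can no longer
unmap memory still referenced by another:

#include <stdio.h>

#define TSO_MAX_BUFF_SIZE 16384

int main(void)
{
	int total_len = 40000;	/* hypothetical tmp_pay_len */
	int tmp_len = total_len;

	while (tmp_len > 0) {
		int buff_size = tmp_len >= TSO_MAX_BUFF_SIZE ?
				TSO_MAX_BUFF_SIZE : tmp_len;
		int offset = total_len - tmp_len;

		/* in the patch: dma_map_single(dev, data + offset,
		 * buff_size, DMA_TO_DEVICE) per descriptor
		 */
		printf("descriptor: offset=%d len=%d\n", offset, buff_size);
		tmp_len -= TSO_MAX_BUFF_SIZE;
	}
	return 0;
}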

Patch

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 83b654b7a9fd..5948774c403f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -4136,16 +4136,18 @@  static bool stmmac_vlan_insert(struct stmmac_priv *priv, struct sk_buff *skb,
 /**
  *  stmmac_tso_allocator - close entry point of the driver
  *  @priv: driver private structure
- *  @des: buffer start address
+ *  @addr: Contains either skb frag address or skb->data address
  *  @total_len: total length to fill in descriptors
  *  @last_segment: condition for the last descriptor
  *  @queue: TX queue index
+ * @is_skb_frag: condition to check whether skb data is part of fragment or not
  *  Description:
  *  This function fills descriptor and request new descriptors according to
  *  buffer length to fill
+ *  This function returns 0 on success else -ERRNO on fail
  */
-static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
-				 int total_len, bool last_segment, u32 queue)
+static int stmmac_tso_allocator(struct stmmac_priv *priv, void *addr,
+				int total_len, bool last_segment, u32 queue, bool is_skb_frag)
 {
 	struct stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[queue];
 	struct dma_desc *desc;
@@ -4153,6 +4155,8 @@  static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
 	int tmp_len;
 
 	tmp_len = total_len;
+	unsigned int offset = 0;
+	unsigned char *data = addr;
 
 	while (tmp_len > 0) {
 		dma_addr_t curr_addr;
@@ -4161,20 +4165,44 @@  static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
 						priv->dma_conf.dma_tx_size);
 		WARN_ON(tx_q->tx_skbuff[tx_q->cur_tx]);
 
+		buff_size = tmp_len >= TSO_MAX_BUFF_SIZE ? TSO_MAX_BUFF_SIZE : tmp_len;
+
 		if (tx_q->tbs & STMMAC_TBS_AVAIL)
 			desc = &tx_q->dma_entx[tx_q->cur_tx].basic;
 		else
 			desc = &tx_q->dma_tx[tx_q->cur_tx];
 
-		curr_addr = des + (total_len - tmp_len);
+		offset = total_len - tmp_len;
+		if (!is_skb_frag) {
+			curr_addr = dma_map_single(priv->device, data + offset, buff_size,
+						   DMA_TO_DEVICE);
+
+			if (dma_mapping_error(priv->device, curr_addr))
+				return -ENOMEM;
+
+			tx_q->tx_skbuff_dma[tx_q->cur_tx].buf = curr_addr;
+			tx_q->tx_skbuff_dma[tx_q->cur_tx].len = buff_size;
+			tx_q->tx_skbuff_dma[tx_q->cur_tx].map_as_page = false;
+			tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = STMMAC_TXBUF_T_SKB;
+		} else {
+			curr_addr = skb_frag_dma_map(priv->device, addr, offset,
+						     buff_size,
+						     DMA_TO_DEVICE);
+
+			if (dma_mapping_error(priv->device, curr_addr))
+				return -ENOMEM;
+
+			tx_q->tx_skbuff_dma[tx_q->cur_tx].buf = curr_addr;
+			tx_q->tx_skbuff_dma[tx_q->cur_tx].len = buff_size;
+			tx_q->tx_skbuff_dma[tx_q->cur_tx].map_as_page = true;
+			tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = STMMAC_TXBUF_T_SKB;
+		}
+
 		if (priv->dma_cap.addr64 <= 32)
 			desc->des0 = cpu_to_le32(curr_addr);
 		else
 			stmmac_set_desc_addr(priv, desc, curr_addr);
 
-		buff_size = tmp_len >= TSO_MAX_BUFF_SIZE ?
-			    TSO_MAX_BUFF_SIZE : tmp_len;
-
 		stmmac_prepare_tso_tx_desc(priv, desc, 0, buff_size,
 				0, 1,
 				(last_segment) && (tmp_len <= TSO_MAX_BUFF_SIZE),
@@ -4182,6 +4210,7 @@  static void stmmac_tso_allocator(struct stmmac_priv *priv, dma_addr_t des,
 
 		tmp_len -= TSO_MAX_BUFF_SIZE;
 	}
+	return 0;
 }
 
 static void stmmac_flush_tx_descriptors(struct stmmac_priv *priv, int queue)
@@ -4351,25 +4380,17 @@  static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 		pay_len = 0;
 	}
 
-	stmmac_tso_allocator(priv, des, tmp_pay_len, (nfrags == 0), queue);
+	if (stmmac_tso_allocator(priv, (skb->data + proto_hdr_len),
+				 tmp_pay_len, nfrags == 0, queue, false))
+		goto dma_map_err;
 
 	/* Prepare fragments */
 	for (i = 0; i < nfrags; i++) {
-		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
-		des = skb_frag_dma_map(priv->device, frag, 0,
-				       skb_frag_size(frag),
-				       DMA_TO_DEVICE);
-		if (dma_mapping_error(priv->device, des))
+		if (stmmac_tso_allocator(priv, frag, skb_frag_size(frag),
+					 (i == nfrags - 1), queue, true))
 			goto dma_map_err;
-
-		stmmac_tso_allocator(priv, des, skb_frag_size(frag),
-				     (i == nfrags - 1), queue);
-
-		tx_q->tx_skbuff_dma[tx_q->cur_tx].buf = des;
-		tx_q->tx_skbuff_dma[tx_q->cur_tx].len = skb_frag_size(frag);
-		tx_q->tx_skbuff_dma[tx_q->cur_tx].map_as_page = true;
-		tx_q->tx_skbuff_dma[tx_q->cur_tx].buf_type = STMMAC_TXBUF_T_SKB;
 	}
 
 	tx_q->tx_skbuff_dma[tx_q->cur_tx].last_segment = true;