diff mbox

[V2] net: ethernet: mellanox: correct page conversion

Message ID 1460845412-13120-1-git-send-email-okaya@codeaurora.org (mailing list archive)
State Not Applicable
Headers show

Commit Message

Sinan Kaya April 16, 2016, 10:23 p.m. UTC
Current code is assuming that the address returned by dma_alloc_coherent
is a logical address. This is not true on ARM/ARM64 systems. This patch
replaces dma_alloc_coherent with dma_map_page API. The address returned
can later be virtually mapped from the CPU side with vmap API.

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
---
 drivers/net/ethernet/mellanox/mlx4/alloc.c | 37 ++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 10 deletions(-)

Comments

David Miller April 18, 2016, 4 a.m. UTC | #1
From: Sinan Kaya <okaya@codeaurora.org>
Date: Sat, 16 Apr 2016 18:23:32 -0400

> Current code is assuming that the address returned by dma_alloc_coherent
> is a logical address. This is not true on ARM/ARM64 systems. This patch
> replaces dma_alloc_coherent with dma_map_page API. The address returned
> can later by virtually mapped from the CPU side with vmap API.
> 
> Signed-off-by: Sinan Kaya <okaya@codeaurora.org>

You can't do this.

The DMA map page API gives non-coherent mappings, and thus requires
proper flushing.

So a straight conversion like this is never legitimate.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sinan Kaya April 18, 2016, 5:06 a.m. UTC | #2
On 2016-04-18 00:00, David Miller wrote:
> From: Sinan Kaya <okaya@codeaurora.org>
> Date: Sat, 16 Apr 2016 18:23:32 -0400
> 
>> Current code is assuming that the address returned by 
>> dma_alloc_coherent
>> is a logical address. This is not true on ARM/ARM64 systems. This 
>> patch
>> replaces dma_alloc_coherent with dma_map_page API. The address 
>> returned
>> can later by virtually mapped from the CPU side with vmap API.
>> 
>> Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
> 
> You can't do this.
> 
> The DMA map page API gives non-coherent mappings, and thus requires
> proper flushing.
> 
> So a straight conversion like this is never legitimate.

I would agree on proper dma api usage. However, the code is already 
assuming coherent architecture by mapping the cpu pages as page_kernel.

Dma_map_page returns cached buffers and you don't need cache flushes on 
coherent architecture to make the data visible.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eli Cohen April 18, 2016, 6:54 a.m. UTC | #3
Sinan,

if we get rid of this part of the code:

                if (BITS_PER_LONG == 64) {
                        struct page **pages;
                        pages = kmalloc(sizeof *pages * buf->nbufs, gfp);
                        if (!pages)
                                goto err_free;
                        ...
                        ...
                        if (!buf->direct.buf)
                                goto err_free;
                }

Does that solve the arm issue?

The other parts of the driver using the allocated buffer can still
work with list of coherent chunks.

On Sat, Apr 16, 2016 at 06:23:32PM -0400, Sinan Kaya wrote:
> Current code is assuming that the address returned by dma_alloc_coherent
> is a logical address. This is not true on ARM/ARM64 systems. This patch
> replaces dma_alloc_coherent with dma_map_page API. The address returned
> can later by virtually mapped from the CPU side with vmap API.
> 
> Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
> ---
>  drivers/net/ethernet/mellanox/mlx4/alloc.c | 37 ++++++++++++++++++++++--------
>  1 file changed, 27 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c
> index 0c51c69..22a7ae7 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/alloc.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c
> @@ -618,13 +618,26 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
>  			return -ENOMEM;
>  
>  		for (i = 0; i < buf->nbufs; ++i) {
> -			buf->page_list[i].buf =
> -				dma_alloc_coherent(&dev->persist->pdev->dev,
> -						   PAGE_SIZE,
> -						   &t, gfp);
> -			if (!buf->page_list[i].buf)
> +			struct page *page;
> +
> +			page = alloc_page(GFP_KERNEL);
> +			if (!page)
>  				goto err_free;
>  
> +			t = dma_map_page(&dev->persist->pdev->dev, page, 0,
> +					 PAGE_SIZE, DMA_BIDIRECTIONAL);
> +
> +			if (dma_mapping_error(&dev->persist->pdev->dev, t)) {
> +				__free_page(page);
> +				goto err_free;
> +			}
> +
> +			buf->page_list[i].buf = page_address(page);
> +			if (!buf->page_list[i].buf) {
> +				__free_page(page);
> +				goto err_free;
> +			}
> +
>  			buf->page_list[i].map = t;
>  
>  			memset(buf->page_list[i].buf, 0, PAGE_SIZE);
> @@ -666,11 +679,15 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf)
>  			vunmap(buf->direct.buf);
>  
>  		for (i = 0; i < buf->nbufs; ++i)
> -			if (buf->page_list[i].buf)
> -				dma_free_coherent(&dev->persist->pdev->dev,
> -						  PAGE_SIZE,
> -						  buf->page_list[i].buf,
> -						  buf->page_list[i].map);
> +			if (buf->page_list[i].buf) {
> +				struct page *page;
> +
> +				page = virt_to_page(buf->page_list[i].buf);
> +				dma_unmap_page(&dev->persist->pdev->dev,
> +					       buf->page_list[i].map,
> +					       PAGE_SIZE, DMA_BIDIRECTIONAL);
> +				__free_page(page);
> +			}
>  		kfree(buf->page_list);
>  	}
>  }
> -- 
> 1.8.2.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
hch@infradead.org April 18, 2016, 12:12 p.m. UTC | #4
On Sat, Apr 16, 2016 at 06:23:32PM -0400, Sinan Kaya wrote:
> Current code is assuming that the address returned by dma_alloc_coherent
> is a logical address. This is not true on ARM/ARM64 systems.

Can you explain what you mean with a 'logical address' and what actual
issue you're trying to solve?

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sinan Kaya April 18, 2016, 1:06 p.m. UTC | #5
On 2016-04-18 08:12, Christoph Hellwig wrote:
> On Sat, Apr 16, 2016 at 06:23:32PM -0400, Sinan Kaya wrote:
>> Current code is assuming that the address returned by 
>> dma_alloc_coherent
>> is a logical address. This is not true on ARM/ARM64 systems.
> 
> Can you explain what you mean with a 'logical address' and what actual
> issue you're trying to solve?

Vmap call is failing on arm64 systems because dma alloc api already 
returns an address mapped with vmap.

Please see arch/arm64/mm directory.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
hch@infradead.org April 18, 2016, 1:10 p.m. UTC | #6
On Mon, Apr 18, 2016 at 09:06:18AM -0400, okaya@codeaurora.org wrote:
> On 2016-04-18 08:12, Christoph Hellwig wrote:
> >On Sat, Apr 16, 2016 at 06:23:32PM -0400, Sinan Kaya wrote:
> >>Current code is assuming that the address returned by dma_alloc_coherent
> >>is a logical address. This is not true on ARM/ARM64 systems.
> >
> >Can you explain what you mean with a 'logical address' and what actual
> >issue you're trying to solve?
> 
> Vmap call is failing on arm64 systems because dma alloc api already returns
> an address mapped with vmap.

Please state your problem clearly.  What I'm reverse engineering from
your posts is:  because dma_alloc_coherent uses vmap-like mappings on
arm64 (all, some systems?) a driver using a lot of them might run into
limits of the vmap pool size.

Is this correct?

> 
> Please see arch/arm64/mm directory.
---end quoted text---
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sinan Kaya April 18, 2016, 1:49 p.m. UTC | #7
On 4/18/2016 9:10 AM, Christoph Hellwig wrote:
> On Mon, Apr 18, 2016 at 09:06:18AM -0400, okaya@codeaurora.org wrote:
>> On 2016-04-18 08:12, Christoph Hellwig wrote:
>>> On Sat, Apr 16, 2016 at 06:23:32PM -0400, Sinan Kaya wrote:
>>>> Current code is assuming that the address returned by dma_alloc_coherent
>>>> is a logical address. This is not true on ARM/ARM64 systems.
>>>
>>> Can you explain what you mean with a 'logical address' and what actual
>>> issue you're trying to solve?
>>
Here is a good description of logical address vs. virtual address.

https://www.quora.com/What-is-the-Kernel-logical-and-virtual-addresses-What-is-the-difference-between-them-What-is-the-type-of-addresses-listed-in-the-System-map


>> Vmap call is failing on arm64 systems because dma alloc api already returns
>> an address mapped with vmap.
> 
> Please state your problem clearly.  What I'm reverse engineering from
> your posts is:  because dma_alloc_coherent uses vmap-like mappings on
> arm64 (all, some systems?) 

All arm64 systems. 

>a driver using a lot of them might run into
> limits of the vmap pool size.
> 
> Is this correct?
> 
No, the driver is plain broken without this patch. It causes a kernel panic 
during driver probe.

This is the definition of vmap API.

https://www.kernel.org/doc/htmldocs/kernel-api/API-vmap.html

VMAP allows you to make several pages look contiguous to the CPU. 
It can only be used against logical addresses returned from kmalloc 
or alloc_page. 

You cannot take several virtually mapped addresses returned by dma_alloc_coherent
and try to make them virtually contiguous again. 

The code happens to work on other architectures by pure luck. AFAIK, dma_alloc_coherent
returns logical addresses on Intel systems until it runs out of DMA memory. After 
that intel arch will also start returning virtually mapped addresses and this code
will also fail. ARM64 on the other hand always returns a virtually mapped address.

The goal of this code is to allocate a bunch of page sized memory and make it look
contiguous. It is just using the wrong API. The correct API is either kmalloc or
alloc_page, then map it with dma_map_page — not dma_alloc_coherent.

The proper usage of dma_map_page requires code to call dma_sync API in correct
places to be compatible with noncoherent systems. This code is already assuming
coherency. It would be nice to have dma_sync APIs in the right places. There is no
harm in calling dma_sync API for coherent systems as they are no-ops in DMA mapping
layer whereas it is a cache flush for non-coherent systems.

>>
>> Please see arch/arm64/mm directory.
> ---end quoted text---
> 

I hope it is clear now. The previous email was the most I could type on my phone.
Sinan Kaya April 18, 2016, 1:53 p.m. UTC | #8
On 4/18/2016 2:54 AM, Eli Cohen wrote:
> Sinan,
> 
> if we get rid of the part this code:
> 
>                 if (BITS_PER_LONG == 64) {
>                         struct page **pages;
>                         pages = kmalloc(sizeof *pages * buf->nbufs, gfp);
>                         if (!pages)
>                                 goto err_free;
>                         ...
>                         ...
>                         if (!buf->direct.buf)
>                                 goto err_free;
>                 }
> 
> Does that solve the arm issue?

I will test. As far as I know, there is one more place these DMA addresses
are called with vmap. This is in mlx4_en_map_buffer.

I was trying to rearrange the allocation so that vmap actually works.

What do you think about mlx4_en_map_buffer?
hch@infradead.org April 18, 2016, 1:59 p.m. UTC | #9
On Mon, Apr 18, 2016 at 09:49:10AM -0400, Sinan Kaya wrote:
> Here is a good description of logical address vs. virtual address.
> 
> https://www.quora.com/What-is-the-Kernel-logical-and-virtual-addresses-What-is-the-difference-between-them-What-is-the-type-of-addresses-listed-in-the-System-map

That's not how we use the terms in Linux.  But it's not really the point
of my question either.

> > Is this correct?
> > 
> No, the driver is plain broken without this patch. It causes a kernel panic 
> during driver probe.
> 
> This is the definition of vmap API.
> 
> https://www.kernel.org/doc/htmldocs/kernel-api/API-vmap.html

Thanks for the pointer, but I'm actually the person who introduced vmap
to Linux a long time ago, and this is once again not my question.

> You cannot take several virtually mapped addresses returned by dma_alloc_coherent
> and try to make them virtually contiguous again. 

But now we're getting closer to the issue: the mlx4_en driver is using
vmap on buffers allocated using dma_alloc_coherent if on a 64-bit
architecture, and that's obviously broken.

Now the big question is: why does it do that, given that
dma_alloc_coherent can be used for high order allocations anyway (and in
fact many architectures implement it using a version of vmap).

Let's get some answers on these questions from the Mellanox folks and
work from there.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eli Cohen April 18, 2016, 2:05 p.m. UTC | #10
Sure, this is not the complete patch. As far as I know the problem
you're facing with arm is that virt_to_page() does not provide the
correct page descriptor so my suggestion will eliminate the need for
it.

On Mon, Apr 18, 2016 at 09:53:30AM -0400, Sinan Kaya wrote:
> On 4/18/2016 2:54 AM, Eli Cohen wrote:
> > Sinan,
> > 
> > if we get rid of the part this code:
> > 
> >                 if (BITS_PER_LONG == 64) {
> >                         struct page **pages;
> >                         pages = kmalloc(sizeof *pages * buf->nbufs, gfp);
> >                         if (!pages)
> >                                 goto err_free;
> >                         ...
> >                         ...
> >                         if (!buf->direct.buf)
> >                                 goto err_free;
> >                 }
> > 
> > Does that solve the arm issue?
> 
> I will test. As far as I know, there is one more place these DMA addresses
> are called with vmap. This is in mlx4_en_map_buffer.
> 
> I was trying to rearrange the allocation so that vmap actually works.
> 
> What do you think about mlx4_en_map_buffer?
> 
> 
> -- 
> Sinan Kaya
> Qualcomm Technologies, Inc. on behalf of Qualcomm Innovation Center, Inc.
> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
hch@infradead.org April 18, 2016, 2:32 p.m. UTC | #11
On Mon, Apr 18, 2016 at 09:54:47AM +0300, Eli Cohen wrote:
> Sinan,
> 
> if we get rid of the part this code:
> 
>                 if (BITS_PER_LONG == 64) {
>                         struct page **pages;
>                         pages = kmalloc(sizeof *pages * buf->nbufs, gfp);
>                         if (!pages)
>                                 goto err_free;
>                         ...
>                         ...
>                         if (!buf->direct.buf)
>                                 goto err_free;
>                 }
> 
> Does that solve the arm issue?

Not quite.  You still have code in mlx4_en_map_buffer that performs
this mapping later if it wasn't mapped in mlx4_buf_alloc.

You'll need to get rid of that by ensuring max_direct for all the cases
currently using mlx4_en_map_buffer as well.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eli Cohen April 18, 2016, 2:39 p.m. UTC | #12
Right, I did not suggest this as a patch but just wanted to pinpoint the problematic issue which is that virt_to_page does not give you the correct pointer to the page.

-----Original Message-----
From: Christoph Hellwig [mailto:hch@infradead.org] 
Sent: Monday, April 18, 2016 9:33 AM
To: Eli Cohen <eli@mellanox.com>
Cc: Sinan Kaya <okaya@codeaurora.org>; linux-rdma@vger.kernel.org; timur@codeaurora.org; cov@codeaurora.org; Yishai Hadas <yishaih@mellanox.com>; netdev@vger.kernel.org; linux-kernel@vger.kernel.org
Subject: Re: [PATCH V2] net: ethernet: mellanox: correct page conversion

On Mon, Apr 18, 2016 at 09:54:47AM +0300, Eli Cohen wrote:
> Sinan,
> 
> if we get rid of the part this code:
> 
>                 if (BITS_PER_LONG == 64) {
>                         struct page **pages;
>                         pages = kmalloc(sizeof *pages * buf->nbufs, gfp);
>                         if (!pages)
>                                 goto err_free;
>                         ...
>                         ...
>                         if (!buf->direct.buf)
>                                 goto err_free;
>                 }
> 
> Does that solve the arm issue?

Not quite.  You still have code in mlx4_en_map_buffer that performs this mapping later if it wasn't mapped in mlx4_buf_alloc.

You'll need to get rid of that by ensuring max_direct for all the cases currently using mlx4_en_map_buffer as well.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Timur Tabi April 18, 2016, 3:15 p.m. UTC | #13
Sinan Kaya wrote:
>
> VMAP allows you to make several pages look contiguous to the CPU.
> It can only be used against logical addresses returned from kmalloc
> or alloc_page.
>
> You cannot take several virtually mapped addresses returned by dma_alloc_coherent
> and try to make them virtually contiguous again.
>
> The code happens to work on other architectures by pure luck. AFAIK, dma_alloc_coherent
> returns logical addresses on Intel systems until it runs out of DMA memory. After
> that intel arch will also start returning virtually mapped addresses and this code
> will also fail. ARM64 on the other hand always returns a virtually mapped address.
>
> The goal of this code is to allocate a bunch of page sized memory and make it look
> contiguous. It is just using the wrong API. The correct API is either kmalloc or
> alloc_page map it with dma_map_page not dma_alloc_coherent.
>
> The proper usage of dma_map_page requires code to call dma_sync API in correct
> places to be compatible with noncoherent systems. This code is already assuming
> coherency. It would be a nice to have dma_sync APIs in right places. There is no
> harm in calling dma_sync API for coherent systems as they are no-ops in DMA mapping
> layer whereas it is a cache flush for non-coherent systems.

The text would be a great addition to the patch description.
hch@infradead.org April 18, 2016, 3:17 p.m. UTC | #14
On Mon, Apr 18, 2016 at 02:39:36PM +0000, Eli Cohen wrote:
> Right, I did not suggest this as a patch but just wanted to pinpoint the problematic issue which is that virt_to_page does not give you the correct pointer to the page.

Removing both the virt_to_page + vmap calls would solve the issue
indeed.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sinan Kaya April 18, 2016, 3:21 p.m. UTC | #15
On 4/18/2016 11:17 AM, Christoph Hellwig wrote:
> On Mon, Apr 18, 2016 at 02:39:36PM +0000, Eli Cohen wrote:
>> Right, I did not suggest this as a patch but just wanted to pinpoint the problematic issue which is that virt_to_page does not give you the correct pointer to the page.
> 
> Removing both the virt_to_page + vmap calls would solve the issue
> indeed.
> 

I was looking at the code. I don't see how removing virt_to_page + vmap 
would solve the issue.

The code is trying to access the buffer space with direct.buf member
from the CPU side. This member would become NULL, when this code is 
removed and also in mlx4_en_map_buffer. 


                if (BITS_PER_LONG == 64) {
                        struct page **pages;
                        pages = kmalloc(sizeof *pages * buf->nbufs, gfp);
                        if (!pages)
                                goto err_free;
                        ...
                        ...
                        if (!buf->direct.buf)
                                goto err_free;
                }

drivers/net/ethernet/mellanox/mlx4/en_tx.c (2 hits)
Line 110: ring->buf = ring->wqres.buf.direct.buf;
Line 114: (unsigned long long) ring->wqres.buf.direct.map);

drivers/net/ethernet/mellanox/mlx4/en_rx.c (1 hit)
Line 404: ring->buf = ring->wqres.buf.direct.buf;

drivers/net/ethernet/mellanox/mlx4/en_cq.c (1 hit)
Line 85: cq->buf = (struct mlx4_cqe *)cq->wqres.buf.direct.buf;

What am I missing?
Sinan Kaya April 18, 2016, 3:22 p.m. UTC | #16
On 4/18/2016 11:15 AM, Timur Tabi wrote:
> Sinan Kaya wrote:
>>
>> VMAP allows you to make several pages look contiguous to the CPU.
>> It can only be used against logical addresses returned from kmalloc
>> or alloc_page.
>>
>> You cannot take several virtually mapped addresses returned by dma_alloc_coherent
>> and try to make them virtually contiguous again.
>>
>> The code happens to work on other architectures by pure luck. AFAIK, dma_alloc_coherent
>> returns logical addresses on Intel systems until it runs out of DMA memory. After
>> that intel arch will also start returning virtually mapped addresses and this code
>> will also fail. ARM64 on the other hand always returns a virtually mapped address.
>>
>> The goal of this code is to allocate a bunch of page sized memory and make it look
>> contiguous. It is just using the wrong API. The correct API is either kmalloc or
>> alloc_page map it with dma_map_page not dma_alloc_coherent.
>>
>> The proper usage of dma_map_page requires code to call dma_sync API in correct
>> places to be compatible with noncoherent systems. This code is already assuming
>> coherency. It would be a nice to have dma_sync APIs in right places. There is no
>> harm in calling dma_sync API for coherent systems as they are no-ops in DMA mapping
>> layer whereas it is a cache flush for non-coherent systems.
> 
> The text would be a great addition to the patch description.
> 

I can do that on the next version.
hch@infradead.org April 18, 2016, 3:40 p.m. UTC | #17
On Mon, Apr 18, 2016 at 11:21:12AM -0400, Sinan Kaya wrote:
> I was looking at the code. I don't see how removing virt_to_page + vmap 
> would solve the issue.
> 
> The code is trying to access the buffer space with direct.buf member
> from the CPU side. This member would become NULL, when this code is 
> removed and also in mlx4_en_map_buffer. 
>
> ...
>
> What am I missing?

As mentioned before you'll also need to enforce you hit the nbufs = 1
case for these.  In fact most callers should simply switch to a plain
dma_zalloc_coherent call without all these wrappers.  If we have a case
where we really want multiple buffers that don't have to be contiguous
(maybe the MTT case) I'd rather opencode that instead of building this
confusing interface on top of it.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 18, 2016, 3:59 p.m. UTC | #18
From: okaya@codeaurora.org
Date: Mon, 18 Apr 2016 01:06:27 -0400

> On 2016-04-18 00:00, David Miller wrote:
>> From: Sinan Kaya <okaya@codeaurora.org>
>> Date: Sat, 16 Apr 2016 18:23:32 -0400
>> 
>>> Current code is assuming that the address returned by
>>> dma_alloc_coherent
>>> is a logical address. This is not true on ARM/ARM64 systems. This
>>> patch
>>> replaces dma_alloc_coherent with dma_map_page API. The address
>>> returned
>>> can later by virtually mapped from the CPU side with vmap API.
>>> Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
>> You can't do this.
>> The DMA map page API gives non-coherent mappings, and thus requires
>> proper flushing.
>> So a straight conversion like this is never legitimate.
> 
> I would agree on proper dma api usage. However, the code is already
> assuming coherent architecture by mapping the cpu pages as
> page_kernel.
> 
> Dma_map_page returns cached buffers and you don't need cache flushes
> on coherent architecture to make the data visible.

All you are telling me is that there are two bugs instead of one, so now
both need to be fixed.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sinan Kaya April 18, 2016, 4:06 p.m. UTC | #19
On 4/18/2016 11:59 AM, David Miller wrote:
> From: okaya@codeaurora.org
> Date: Mon, 18 Apr 2016 01:06:27 -0400
> 
>> On 2016-04-18 00:00, David Miller wrote:
>>> From: Sinan Kaya <okaya@codeaurora.org>
>>> Date: Sat, 16 Apr 2016 18:23:32 -0400
>>>
>>>> Current code is assuming that the address returned by
>>>> dma_alloc_coherent
>>>> is a logical address. This is not true on ARM/ARM64 systems. This
>>>> patch
>>>> replaces dma_alloc_coherent with dma_map_page API. The address
>>>> returned
>>>> can later by virtually mapped from the CPU side with vmap API.
>>>> Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
>>> You can't do this.
>>> The DMA map page API gives non-coherent mappings, and thus requires
>>> proper flushing.
>>> So a straight conversion like this is never legitimate.
>>
>> I would agree on proper dma api usage. However, the code is already
>> assuming coherent architecture by mapping the cpu pages as
>> page_kernel.
>>
>> Dma_map_page returns cached buffers and you don't need cache flushes
>> on coherent architecture to make the data visible.
> 
> All you are telling me is that there are two bugs instead of one, so now
> both need to be fixed.
> 

The removal of vmap also fixes the coherency assumption. It is one fix
for both.

I was thinking of submitting another patch to change the vmap argument
PAGE_KERNEL based on the coherency support of the architecture. I don't need
to do that anymore if the other experiment works.
Sinan Kaya April 18, 2016, 5:47 p.m. UTC | #20
On 4/18/2016 11:40 AM, Christoph Hellwig wrote:
> On Mon, Apr 18, 2016 at 11:21:12AM -0400, Sinan Kaya wrote:
>> I was looking at the code. I don't see how removing virt_to_page + vmap 
>> would solve the issue.
>>
>> The code is trying to access the buffer space with direct.buf member
>> from the CPU side. This member would become NULL, when this code is 
>> removed and also in mlx4_en_map_buffer. 
>>
>> ...
>>
>> What am I missing?
> 
> As mentioned before you'll also need to enforce you hit the nbufs = 1
> case for these.  In fact most callers should simply switch to a plain
> dma_zalloc_coherent call without all these wrappers.  If we have a case
> where we really want multiple buffers that don't have to be contiguous
> (maybe the MTT case) I'd rather opencode that instead of building this
> confusing interface on top of it.
> 

I hit the first problem with CQE. The alloc routine is allocating pages
but CQE code is trying to do linear access with direct buf member. 


I see that this code implements page_list support. I'd like to do the same
thing for CQE. Let me know if I'm in the right path.

static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry, u8 eqe_factor,
				u8 eqe_size)
Sinan Kaya April 19, 2016, 7:50 a.m. UTC | #21
On 4/18/2016 11:40 AM, Christoph Hellwig wrote:
> On Mon, Apr 18, 2016 at 11:21:12AM -0400, Sinan Kaya wrote:
>> I was looking at the code. I don't see how removing virt_to_page + vmap 
>> would solve the issue.
>>
>> The code is trying to access the buffer space with direct.buf member
>> from the CPU side. This member would become NULL, when this code is 
>> removed and also in mlx4_en_map_buffer. 
>>
>> ...
>>
>> What am I missing?
> 
> As mentioned before you'll also need to enforce you hit the nbufs = 1
> case for these.  In fact most callers should simply switch to a plain
> dma_zalloc_coherent call without all these wrappers.  If we have a case
> where we really want multiple buffers that don't have to be contiguous
> (maybe the MTT case) I'd rather opencode that instead of building this
> confusing interface on top of it.
> 

So, I did my fair share of investigation. As I pointed out in my previous email, 
the code is allocating a bunch of page sized arrays and using them for receive,
transmit and control descriptors. 

I'm unable to limit nbufs to 1 because, none of these allocations make a single
contiguous allocation by default. They all go to multiple page approach due to 
2 * PAGE_SIZE max_direct parameter passed. 

I tried changing the code to handle page_list vs. single allocation. I was able
to do this for CQE and receive queue since both of them allocate fixed size chunks. 
However, I couldn't do this for the transmit queue. 

The transmit code uses the array of descriptors for variable sized transfers and 
it also assumes that the descriptors are contiguous.

When used with pages, one tx data can spill beyond the first page and do illegal
writes.

In the end, my proposed code in this patch is much simpler than what I tried to 
achieve by removing vmap API. 

Another alternative is to force code use single DMA alloc for all 64 bit architectures.

Something like this:

-- a/drivers/net/ethernet/mellanox/mlx4/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c
@@ -588,7 +588,7 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 {
        dma_addr_t t;

-       if (size <= max_direct) {
+       if ((size <= max_direct) || (BITS_PER_LONG == 64)){
                buf->nbufs        = 1;
                buf->npages       = 1;
                buf->page_shift   = get_order(size) + PAGE_SHIFT;

This also works on arm64. My proposal is more scalable for memory consumption compared
to this one.
diff mbox

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c
index 0c51c69..22a7ae7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c
@@ -618,13 +618,26 @@  int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
 			return -ENOMEM;
 
 		for (i = 0; i < buf->nbufs; ++i) {
-			buf->page_list[i].buf =
-				dma_alloc_coherent(&dev->persist->pdev->dev,
-						   PAGE_SIZE,
-						   &t, gfp);
-			if (!buf->page_list[i].buf)
+			struct page *page;
+
+			page = alloc_page(GFP_KERNEL);
+			if (!page)
 				goto err_free;
 
+			t = dma_map_page(&dev->persist->pdev->dev, page, 0,
+					 PAGE_SIZE, DMA_BIDIRECTIONAL);
+
+			if (dma_mapping_error(&dev->persist->pdev->dev, t)) {
+				__free_page(page);
+				goto err_free;
+			}
+
+			buf->page_list[i].buf = page_address(page);
+			if (!buf->page_list[i].buf) {
+				__free_page(page);
+				goto err_free;
+			}
+
 			buf->page_list[i].map = t;
 
 			memset(buf->page_list[i].buf, 0, PAGE_SIZE);
@@ -666,11 +679,15 @@  void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf)
 			vunmap(buf->direct.buf);
 
 		for (i = 0; i < buf->nbufs; ++i)
-			if (buf->page_list[i].buf)
-				dma_free_coherent(&dev->persist->pdev->dev,
-						  PAGE_SIZE,
-						  buf->page_list[i].buf,
-						  buf->page_list[i].map);
+			if (buf->page_list[i].buf) {
+				struct page *page;
+
+				page = virt_to_page(buf->page_list[i].buf);
+				dma_unmap_page(&dev->persist->pdev->dev,
+					       buf->page_list[i].map,
+					       PAGE_SIZE, DMA_BIDIRECTIONAL);
+				__free_page(page);
+			}
 		kfree(buf->page_list);
 	}
 }