[net-next,13/15] eth: fbnic: add basic Rx handling

Message ID 171217496013.1598374.10126180029382922588.stgit@ahduyck-xeon-server.home.arpa (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Series eth: fbnic: Add network driver for Meta Platforms Host Network Interface

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 943 this patch: 943
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 7 maintainers not CCed: john.fastabend@gmail.com ast@kernel.org daniel@iogearbox.net kernel-team@meta.com hawk@kernel.org edumazet@google.com bpf@vger.kernel.org
netdev/build_clang success Errors and warnings before: 956 this patch: 954
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 954 this patch: 954
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 459 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 4 this patch: 4
netdev/source_inline success Was 0 now: 0

Commit Message

Alexander H Duyck April 3, 2024, 8:09 p.m. UTC
From: Alexander Duyck <alexanderduyck@fb.com>

Handle Rx packets with basic csum and Rx hash offloads.

The NIC writes back to the completion ring a head buffer descriptor
(data buffer allocated from header pages), a variable number of payload
descriptors (data buffers in payload pages), an optional metadata
descriptor (type 3) and finally the primary metadata descriptor
(type 2).

This format makes scatter support fairly easy - start gathering the
pages when we see the head page, keep gathering until we see the
primary metadata descriptor, then do the processing. Use the XDP infra
to collect the packet fragments as we traverse the descriptors. XDP
itself is not supported yet, but it will be soon.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
---
 drivers/net/ethernet/meta/fbnic/fbnic_csr.h    |   60 ++++
 drivers/net/ethernet/meta/fbnic/fbnic_netdev.c |    4 
 drivers/net/ethernet/meta/fbnic/fbnic_pci.c    |    3 
 drivers/net/ethernet/meta/fbnic/fbnic_txrx.c   |  322 ++++++++++++++++++++++++
 drivers/net/ethernet/meta/fbnic/fbnic_txrx.h   |    2 
 5 files changed, 387 insertions(+), 4 deletions(-)

Comments

Yunsheng Lin April 9, 2024, 11:47 a.m. UTC | #1
On 2024/4/4 4:09, Alexander Duyck wrote:
> From: Alexander Duyck <alexanderduyck@fb.com>

...

> +
> +static int fbnic_clean_rcq(struct fbnic_napi_vector *nv,
> +			   struct fbnic_q_triad *qt, int budget)
> +{
> +	struct fbnic_ring *rcq = &qt->cmpl;
> +	struct fbnic_pkt_buff *pkt;
> +	s32 head0 = -1, head1 = -1;
> +	__le64 *raw_rcd, done;
> +	u32 head = rcq->head;
> +	u64 packets = 0;
> +
> +	done = (head & (rcq->size_mask + 1)) ? cpu_to_le64(FBNIC_RCD_DONE) : 0;
> +	raw_rcd = &rcq->desc[head & rcq->size_mask];
> +	pkt = rcq->pkt;
> +
> +	/* Walk the completion queue collecting the heads reported by NIC */
> +	while (likely(packets < budget)) {
> +		struct sk_buff *skb = ERR_PTR(-EINVAL);
> +		u64 rcd;
> +
> +		if ((*raw_rcd & cpu_to_le64(FBNIC_RCD_DONE)) == done)
> +			break;
> +
> +		dma_rmb();
> +
> +		rcd = le64_to_cpu(*raw_rcd);
> +
> +		switch (FIELD_GET(FBNIC_RCD_TYPE_MASK, rcd)) {
> +		case FBNIC_RCD_TYPE_HDR_AL:
> +			head0 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd);
> +			fbnic_pkt_prepare(nv, rcd, pkt, qt);
> +
> +			break;
> +		case FBNIC_RCD_TYPE_PAY_AL:
> +			head1 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd);
> +			fbnic_add_rx_frag(nv, rcd, pkt, qt);
> +
> +			break;
> +		case FBNIC_RCD_TYPE_OPT_META:
> +			/* Only type 0 is currently supported */
> +			if (FIELD_GET(FBNIC_RCD_OPT_META_TYPE_MASK, rcd))
> +				break;
> +
> +			/* We currently ignore the action table index */
> +			break;
> +		case FBNIC_RCD_TYPE_META:
> +			if (likely(!fbnic_rcd_metadata_err(rcd)))
> +				skb = fbnic_build_skb(nv, pkt);
> +
> +			/* populate skb and invalidate XDP */
> +			if (!IS_ERR_OR_NULL(skb)) {
> +				fbnic_populate_skb_fields(nv, rcd, skb, qt);
> +
> +				packets++;
> +
> +				napi_gro_receive(&nv->napi, skb);
> +			}
> +
> +			pkt->buff.data_hard_start = NULL;
> +
> +			break;
> +		}
> +
> +		raw_rcd++;
> +		head++;
> +		if (!(head & rcq->size_mask)) {
> +			done ^= cpu_to_le64(FBNIC_RCD_DONE);
> +			raw_rcd = &rcq->desc[0];
> +		}
> +	}
> +
> +	/* Unmap and free processed buffers */
> +	if (head0 >= 0)
> +		fbnic_clean_bdq(nv, budget, &qt->sub0, head0);
> +	fbnic_fill_bdq(nv, &qt->sub0);
> +
> +	if (head1 >= 0)
> +		fbnic_clean_bdq(nv, budget, &qt->sub1, head1);
> +	fbnic_fill_bdq(nv, &qt->sub1);

I am not sure how complicated the rx handling will be for the advanced
feature. For the current code, for each entry/desc in both qt->sub0 and
qt->sub1 at least need one page, and the page seems to be only used once
no matter however small the page is used?

I am assuming you want to do 'tightly optimized' operation for this by
calling page_pool_fragment_page(), but manipulating page->pp_ref_count
directly does not seems to add any value for the current code, but seem
to waste a lot of memory by not using the frag API, especially PAGE_SIZE
> 4K?

> +
> +	/* Record the current head/tail of the queue */
> +	if (rcq->head != head) {
> +		rcq->head = head;
> +		writel(head & rcq->size_mask, rcq->doorbell);
> +	}
> +
> +	return packets;
> +}
>  
>
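
To make the two approaches being contrasted here concrete, a minimal
sketch (illustrative only: the 2048-byte size and the bias seed are
made-up values, error handling is omitted, and none of this is code
from this driver):

#include <linux/limits.h>
#include <linux/mm.h>
#include <net/page_pool/helpers.h>

/* Style 1: let the page_pool frag API do the sub-page accounting. */
static void *rx_alloc_with_frag_api(struct page_pool *pp, unsigned int *off)
{
	/* The pool hands back a 2048-byte slice and tracks the page's
	 * remaining space and reference count internally.
	 */
	struct page *page = page_pool_dev_alloc_frag(pp, off, 2048);

	return page ? page_address(page) + *off : NULL;
}

/* Style 2 (the pattern fbnic argues for): take whole pages and let the
 * driver do the fragmenting, so the split can follow whatever slicing
 * the hardware reports back in the completion descriptors.
 */
static struct page *rx_alloc_whole_page(struct page_pool *pp)
{
	struct page *page = page_pool_dev_alloc_pages(pp);

	if (page)		/* seed a large reference count ... */
		page_pool_fragment_page(page, LONG_MAX);
	return page;		/* ... and carve it up in the driver */
}
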
Alexander H Duyck April 9, 2024, 3:08 p.m. UTC | #2
On Tue, Apr 9, 2024 at 4:47 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/4/4 4:09, Alexander Duyck wrote:
> > From: Alexander Duyck <alexanderduyck@fb.com>

[...]

> > +     /* Unmap and free processed buffers */
> > +     if (head0 >= 0)
> > +             fbnic_clean_bdq(nv, budget, &qt->sub0, head0);
> > +     fbnic_fill_bdq(nv, &qt->sub0);
> > +
> > +     if (head1 >= 0)
> > +             fbnic_clean_bdq(nv, budget, &qt->sub1, head1);
> > +     fbnic_fill_bdq(nv, &qt->sub1);
>
> I am not sure how complicated the rx handling will be for the advanced
> feature. For the current code, for each entry/desc in both qt->sub0 and
> qt->sub1 at least need one page, and the page seems to be only used once
> no matter however small the page is used?
>
> I am assuming you want to do 'tightly optimized' operation for this by
> calling page_pool_fragment_page(), but manipulating page->pp_ref_count
> directly does not seems to add any value for the current code, but seem
> to waste a lot of memory by not using the frag API, especially PAGE_SIZE
> > 4K?

On this hardware both the header and payload buffers are fragmentable.
The hardware decides the partitioning and we just follow it. So for
example it wouldn't be uncommon to have a jumbo frame split up such
that the header is less than 128B plus SKB overhead while the actual
data in the payload is just over 1400. So for us fragmenting the pages
is a very likely case especially with smaller packets.

It is better for us to optimize for the small packet scenario than
optimize for the case where 4K slices are getting taken. That way when
we are CPU constrained handling small packets we are the most
optimized whereas for the larger frames we can spare a few cycles to
account for the extra overhead. The result should be a higher overall
packets per second.
Yunsheng Lin April 10, 2024, 11:54 a.m. UTC | #3
On 2024/4/9 23:08, Alexander Duyck wrote:
> On Tue, Apr 9, 2024 at 4:47 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2024/4/4 4:09, Alexander Duyck wrote:
>>> From: Alexander Duyck <alexanderduyck@fb.com>
> 
> [...]
> 
>>> +     /* Unmap and free processed buffers */
>>> +     if (head0 >= 0)
>>> +             fbnic_clean_bdq(nv, budget, &qt->sub0, head0);
>>> +     fbnic_fill_bdq(nv, &qt->sub0);
>>> +
>>> +     if (head1 >= 0)
>>> +             fbnic_clean_bdq(nv, budget, &qt->sub1, head1);
>>> +     fbnic_fill_bdq(nv, &qt->sub1);
>>
>> I am not sure how complicated the rx handling will be for the advanced
>> feature. For the current code, for each entry/desc in both qt->sub0 and
>> qt->sub1 at least need one page, and the page seems to be only used once
>> no matter however small the page is used?
>>
>> I am assuming you want to do 'tightly optimized' operation for this by
>> calling page_pool_fragment_page(), but manipulating page->pp_ref_count
>> directly does not seems to add any value for the current code, but seem
>> to waste a lot of memory by not using the frag API, especially PAGE_SIZE
>>> 4K?
> 
> On this hardware both the header and payload buffers are fragmentable.
> The hardware decides the partitioning and we just follow it. So for
> example it wouldn't be uncommon to have a jumbo frame split up such
> that the header is less than 128B plus SKB overhead while the actual
> data in the payload is just over 1400. So for us fragmenting the pages
> is a very likely case especially with smaller packets.

I understand that is what you are trying to do, but the code above does
not seems to match the description, as the fbnic_clean_bdq() and
fbnic_fill_bdq() are called for qt->sub0 and qt->sub1, so the old pages
of qt->sub0 and qt->sub1 just cleaned are drained and refill each sub
with new pages, which does not seems to have any fragmenting?

The fragmenting can only happen when there is continuous small packet
coming from wire so that hw can report the same pg_id for different
packet with pg_offset before fbnic_clean_bdq() and fbnic_fill_bdq()
is called? I am not sure how to ensure that considering that we might
break out of while loop in fbnic_clean_rcq() because of 'packets < budget'
checking.

> 
> It is better for us to optimize for the small packet scenario than
> optimize for the case where 4K slices are getting taken. That way when
> we are CPU constrained handling small packets we are the most
> optimized whereas for the larger frames we can spare a few cycles to
> account for the extra overhead. The result should be a higher overall
> packets per second.

The problem is that small packet means low utilization of the bandwidth
as more bandwidth is used to send header instead of payload that is useful
for the user, so the question seems to be how often the small packet is
seen in the wire?

> .
>
Alexander H Duyck April 10, 2024, 3:03 p.m. UTC | #4
On Wed, Apr 10, 2024 at 4:54 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/4/9 23:08, Alexander Duyck wrote:
> > On Tue, Apr 9, 2024 at 4:47 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
> >>
> >> On 2024/4/4 4:09, Alexander Duyck wrote:
> >>> From: Alexander Duyck <alexanderduyck@fb.com>
> >
> > [...]
> >
> >>> +     /* Unmap and free processed buffers */
> >>> +     if (head0 >= 0)
> >>> +             fbnic_clean_bdq(nv, budget, &qt->sub0, head0);
> >>> +     fbnic_fill_bdq(nv, &qt->sub0);
> >>> +
> >>> +     if (head1 >= 0)
> >>> +             fbnic_clean_bdq(nv, budget, &qt->sub1, head1);
> >>> +     fbnic_fill_bdq(nv, &qt->sub1);
> >>
> >> I am not sure how complicated the rx handling will be for the advanced
> >> feature. For the current code, for each entry/desc in both qt->sub0 and
> >> qt->sub1 at least need one page, and the page seems to be only used once
> >> no matter however small the page is used?
> >>
> >> I am assuming you want to do 'tightly optimized' operation for this by
> >> calling page_pool_fragment_page(), but manipulating page->pp_ref_count
> >> directly does not seems to add any value for the current code, but seem
> >> to waste a lot of memory by not using the frag API, especially PAGE_SIZE
> >>> 4K?
> >
> > On this hardware both the header and payload buffers are fragmentable.
> > The hardware decides the partitioning and we just follow it. So for
> > example it wouldn't be uncommon to have a jumbo frame split up such
> > that the header is less than 128B plus SKB overhead while the actual
> > data in the payload is just over 1400. So for us fragmenting the pages
> > is a very likely case especially with smaller packets.
>
> I understand that is what you are trying to do, but the code above does
> not seems to match the description, as the fbnic_clean_bdq() and
> fbnic_fill_bdq() are called for qt->sub0 and qt->sub1, so the old pages
> of qt->sub0 and qt->sub1 just cleaned are drained and refill each sub
> with new pages, which does not seems to have any fragmenting?

That is because it is all taken care of by the completion queue. Take
a look in fbnic_pkt_prepare. We are taking the buffer from the header
descriptor and taking a slice out of it there via fbnic_page_pool_get.
Basically we store the fragment count locally in the rx_buf and then
subtract what is leftover when the device is done with it.
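
A rough sketch of that pattern, with illustrative names (these are not
the actual fbnic helpers), assuming the page_pool_unref_page() /
page_pool_put_unrefed_page() helpers available in recent kernels, and
with allocation error handling omitted:

#include <linux/limits.h>
#include <net/page_pool/helpers.h>

struct ex_rx_buf {
	struct page	*page;
	long		pagecnt_bias;	/* references still held by the driver */
};

/* Seed a large reference count up front so slices can be handed out
 * later without touching the page's pp_ref_count again.
 */
static void ex_buf_init(struct page_pool *pp, struct ex_rx_buf *buf)
{
	struct page *page = page_pool_dev_alloc_pages(pp);

	page_pool_fragment_page(page, LONG_MAX);
	buf->page = page;
	buf->pagecnt_bias = LONG_MAX;
}

/* One reference leaves the driver for every slice the completion
 * descriptor reports, e.g. when it is attached to an skb frag that
 * will be returned through the page_pool recycling path later.
 */
static struct page *ex_buf_get_slice(struct ex_rx_buf *buf)
{
	buf->pagecnt_bias--;
	return buf->page;
}

/* When the HW is done with the buffer, settle the difference: drop the
 * references that were never handed out and, if nothing else is still
 * holding the page, return it to the pool (allow_direct assumes napi
 * context).
 */
static void ex_buf_retire(struct page_pool *pp, struct ex_rx_buf *buf)
{
	if (!page_pool_unref_page(buf->page, buf->pagecnt_bias))
		page_pool_put_unrefed_page(pp, buf->page, -1, true);
	buf->page = NULL;
}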

> The fragmenting can only happen when there is continuous small packet
> coming from wire so that hw can report the same pg_id for different
> packet with pg_offset before fbnic_clean_bdq() and fbnic_fill_bdq()
> is called? I am not sure how to ensure that considering that we might
> break out of while loop in fbnic_clean_rcq() because of 'packets < budget'
> checking.

We don't free the page until we have moved one past it, or the
hardware has indicated it will take no more slices via a PAGE_FIN bit
in the descriptor.

> > It is better for us to optimize for the small packet scenario than
> > optimize for the case where 4K slices are getting taken. That way when
> > we are CPU constrained handling small packets we are the most
> > optimized whereas for the larger frames we can spare a few cycles to
> > account for the extra overhead. The result should be a higher overall
> > packets per second.
>
> The problem is that small packet means low utilization of the bandwidth
> as more bandwidth is used to send header instead of payload that is useful
> for the user, so the question seems to be how often the small packet is
> seen in the wire?

Very often. Especially when you are running something like servers
where the flow usually consists of an incoming request which is often
only a few hundred bytes, followed by us sending a response which then
leads to a flow of control frames for it.
Yunsheng Lin April 12, 2024, 8:43 a.m. UTC | #5
On 2024/4/10 23:03, Alexander Duyck wrote:
> On Wed, Apr 10, 2024 at 4:54 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2024/4/9 23:08, Alexander Duyck wrote:
>>> On Tue, Apr 9, 2024 at 4:47 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>>>
>>>> On 2024/4/4 4:09, Alexander Duyck wrote:
>>>>> From: Alexander Duyck <alexanderduyck@fb.com>
>>>
>>> [...]
>>>
>>>>> +     /* Unmap and free processed buffers */
>>>>> +     if (head0 >= 0)
>>>>> +             fbnic_clean_bdq(nv, budget, &qt->sub0, head0);
>>>>> +     fbnic_fill_bdq(nv, &qt->sub0);
>>>>> +
>>>>> +     if (head1 >= 0)
>>>>> +             fbnic_clean_bdq(nv, budget, &qt->sub1, head1);
>>>>> +     fbnic_fill_bdq(nv, &qt->sub1);
>>>>
>>>> I am not sure how complicated the rx handling will be for the advanced
>>>> feature. For the current code, for each entry/desc in both qt->sub0 and
>>>> qt->sub1 at least need one page, and the page seems to be only used once
>>>> no matter however small the page is used?
>>>>
>>>> I am assuming you want to do 'tightly optimized' operation for this by
>>>> calling page_pool_fragment_page(), but manipulating page->pp_ref_count
>>>> directly does not seems to add any value for the current code, but seem
>>>> to waste a lot of memory by not using the frag API, especially PAGE_SIZE
>>>>> 4K?
>>>
>>> On this hardware both the header and payload buffers are fragmentable.
>>> The hardware decides the partitioning and we just follow it. So for
>>> example it wouldn't be uncommon to have a jumbo frame split up such
>>> that the header is less than 128B plus SKB overhead while the actual
>>> data in the payload is just over 1400. So for us fragmenting the pages
>>> is a very likely case especially with smaller packets.
>>
>> I understand that is what you are trying to do, but the code above does
>> not seems to match the description, as the fbnic_clean_bdq() and
>> fbnic_fill_bdq() are called for qt->sub0 and qt->sub1, so the old pages
>> of qt->sub0 and qt->sub1 just cleaned are drained and refill each sub
>> with new pages, which does not seems to have any fragmenting?
> 
> That is because it is all taken care of by the completion queue. Take
> a look in fbnic_pkt_prepare. We are taking the buffer from the header
> descriptor and taking a slice out of it there via fbnic_page_pool_get.
> Basically we store the fragment count locally in the rx_buf and then
> subtract what is leftover when the device is done with it.

The above looks a lot like the prepare/commit API in [1], the prepare
is done in fbnic_fill_bdq() and commit is done by fbnic_page_pool_get() in
fbnic_pkt_prepare() and fbnic_add_rx_frag().

If page_pool is able to provide a central place for pagecnt_bias of all the
fragments of the same page, we may provide a similar prepare/commit API for
frag API, I am not sure how to handle it for now.

From the below macro, this hw seems to be only able to handle 4K memory for
each entry/desc in qt->sub0 and qt->sub1, so there seems to be a lot of memory
that is unused for PAGE_SIZE > 4K as it is allocating memory based on page
granularity for each rx_buf in qt->sub0 and qt->sub1.

+#define FBNIC_RCD_AL_BUFF_OFF_MASK		DESC_GENMASK(43, 32)

It is still possible to reserve enough pagecnt_bias for each fragment, so that
the caller can still do its own fragmenting on fragment granularity as we
seems to have enough pagecnt_bias for each page.

If we provide a proper frag API to reserve enough pagecnt_bias for caller to
do its own fragmenting, then the memory waste may be avoided for this hw in
system with PAGE_SIZE > 4K.

1. https://lore.kernel.org/lkml/20240407130850.19625-10-linyunsheng@huawei.com/

> 
>> The fragmenting can only happen when there is continuous small packet
>> coming from wire so that hw can report the same pg_id for different
>> packet with pg_offset before fbnic_clean_bdq() and fbnic_fill_bdq()
>> is called? I am not sure how to ensure that considering that we might
>> break out of while loop in fbnic_clean_rcq() because of 'packets < budget'
>> checking.
> 
> We don't free the page until we have moved one past it, or the
> hardware has indicated it will take no more slices via a PAGE_FIN bit
> in the descriptor.


I look more closely at it, I am not able to figure it out how it is done
yet, as the PAGE_FIN bit mentioned above seems to be only used to calculate
the hdr_pg_end and truesize in fbnic_pkt_prepare() and fbnic_add_rx_frag().

For the below flow in fbnic_clean_rcq(), fbnic_clean_bdq() will be called
to drain the page in rx_buf just cleaned when head0/head1 >= 0, so I am not
sure how it do the fragmenting yet, am I missing something obvious here?

	while (likely(packets < budget)) {
		switch (FIELD_GET(FBNIC_RCD_TYPE_MASK, rcd)) {
		case FBNIC_RCD_TYPE_HDR_AL:
			head0 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd);
			fbnic_pkt_prepare(nv, rcd, pkt, qt);

			break;
		case FBNIC_RCD_TYPE_PAY_AL:
			head1 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd);
			fbnic_add_rx_frag(nv, rcd, pkt, qt);

			break;

		case FBNIC_RCD_TYPE_META:
			if (likely(!fbnic_rcd_metadata_err(rcd)))
				skb = fbnic_build_skb(nv, pkt);

			/* populate skb and invalidate XDP */
			if (!IS_ERR_OR_NULL(skb)) {
				fbnic_populate_skb_fields(nv, rcd, skb, qt);

				packets++;

				napi_gro_receive(&nv->napi, skb);
			}

			pkt->buff.data_hard_start = NULL;

			break;
		}

	/* Unmap and free processed buffers */
	if (head0 >= 0)
		fbnic_clean_bdq(nv, budget, &qt->sub0, head0);
	fbnic_fill_bdq(nv, &qt->sub0);

	if (head1 >= 0)
		fbnic_clean_bdq(nv, budget, &qt->sub1, head1);
	fbnic_fill_bdq(nv, &qt->sub1);
	
	}

> 
>>> It is better for us to optimize for the small packet scenario than
>>> optimize for the case where 4K slices are getting taken. That way when
>>> we are CPU constrained handling small packets we are the most
>>> optimized whereas for the larger frames we can spare a few cycles to
>>> account for the extra overhead. The result should be a higher overall
>>> packets per second.
>>
>> The problem is that small packet means low utilization of the bandwidth
>> as more bandwidth is used to send header instead of payload that is useful
>> for the user, so the question seems to be how often the small packet is
>> seen in the wire?
> 
> Very often. Especially when you are running something like servers
> where the flow usually consists of an incoming request which is often
> only a few hundred bytes, followed by us sending a response which then
> leads to a flow of control frames for it.

I think this is depending on the use case, if it is video streaming server,
I guess most of the packet is mtu-sized?

> .
>
Yunsheng Lin April 12, 2024, 9:47 a.m. UTC | #6
On 2024/4/12 16:43, Yunsheng Lin wrote:
> On 2024/4/10 23:03, Alexander Duyck wrote:

> 
> If we provide a proper frag API to reserve enough pagecnt_bias for caller to
> do its own fragmenting, then the memory waste may be avoided for this hw in
> system with PAGE_SIZE > 4K.
> 

Something like the page_pool_alloc_frag_bias() API below:

diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 1d397c1a0043..018943307e68 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -92,6 +92,14 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
        return page_pool_alloc_pages(pool, gfp);
 }

+static inline struct page *page_pool_alloc_frag(struct page_pool *pool,
+                                               unsigned int *offset,
+                                               unsigned int size,
+                                               gfp_t gfp)
+{
+       return page_pool_alloc_frag_bias(pool, offset, size, 1U, gfp);
+}
+
 /**
  * page_pool_dev_alloc_frag() - allocate a page fragment.
  * @pool: pool from which to allocate
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 5e43a08d3231..2847b96264e5 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -202,8 +202,9 @@ struct page_pool {
 };

 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
-struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
-                                 unsigned int size, gfp_t gfp);
+struct page *page_pool_alloc_frag_bias(struct page_pool *pool,
+                                      unsigned int *offset, unsigned int size,
+                                      unsigned int bias, gfp_t gfp);
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 struct page_pool *page_pool_create_percpu(const struct page_pool_params *params,
                                          int cpuid);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 4c175091fc0a..441b54473c35 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -850,9 +850,11 @@ static void page_pool_free_frag(struct page_pool *pool)
        page_pool_return_page(pool, page);
 }

-struct page *page_pool_alloc_frag(struct page_pool *pool,
-                                 unsigned int *offset,
-                                 unsigned int size, gfp_t gfp)
+struct page *page_pool_alloc_frag_bias(struct page_pool *pool,
+                                      unsigned int *offset,
+                                      unsigned int size,
+                                      unsigned int bias,
+                                      gfp_t gfp)
 {
        unsigned int max_size = PAGE_SIZE << pool->p.order;
        struct page *page = pool->frag_page;
@@ -881,19 +883,19 @@ struct page *page_pool_alloc_frag(struct page_pool *pool,
                pool->frag_page = page;

 frag_reset:
-               pool->frag_users = 1;
+               pool->frag_users = bias;
                *offset = 0;
                pool->frag_offset = size;
                page_pool_fragment_page(page, BIAS_MAX);
                return page;
        }

-       pool->frag_users++;
+       pool->frag_users += bias;
        pool->frag_offset = *offset + size;
        alloc_stat_inc(pool, fast);
        return page;
 }
-EXPORT_SYMBOL(page_pool_alloc_frag);
+EXPORT_SYMBOL(page_pool_alloc_frag_bias);

 static void page_pool_empty_ring(struct page_pool *pool)
 {
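
A hypothetical caller of the API proposed above might then look like
this (the 4096-byte slice, the bias of 16 and the ex_slice_buf
bookkeeping are illustrative assumptions, not fbnic code):

#include <net/page_pool/helpers.h>

struct ex_slice_buf {
	struct page	*page;
	unsigned int	offset;
	long		pagecnt_bias;	/* references reserved for this slice */
};

static int ex_slice_fill(struct page_pool *pp, struct ex_slice_buf *buf)
{
	unsigned int offset;
	struct page *page;

	/* Ask for a 4K slice and reserve 16 references on it up front,
	 * so the driver can hand the slice out piecemeal and drop one
	 * reference (e.g. via page_pool_put_page()) as the HW finishes
	 * with each piece, without touching pp_ref_count directly.
	 */
	page = page_pool_alloc_frag_bias(pp, &offset, 4096, 16, GFP_ATOMIC);
	if (!page)
		return -ENOMEM;

	buf->page = page;
	buf->offset = offset;
	buf->pagecnt_bias = 16;
	return 0;
}
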
Alexander H Duyck April 12, 2024, 3:05 p.m. UTC | #7
On Fri, Apr 12, 2024 at 1:43 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/4/10 23:03, Alexander Duyck wrote:
> > On Wed, Apr 10, 2024 at 4:54 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
> >>
> >> On 2024/4/9 23:08, Alexander Duyck wrote:
> >>> On Tue, Apr 9, 2024 at 4:47 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
> >>>>
> >>>> On 2024/4/4 4:09, Alexander Duyck wrote:
> >>>>> From: Alexander Duyck <alexanderduyck@fb.com>
> >>>
> >>> [...]
> >>>
> >>>>> +     /* Unmap and free processed buffers */
> >>>>> +     if (head0 >= 0)
> >>>>> +             fbnic_clean_bdq(nv, budget, &qt->sub0, head0);
> >>>>> +     fbnic_fill_bdq(nv, &qt->sub0);
> >>>>> +
> >>>>> +     if (head1 >= 0)
> >>>>> +             fbnic_clean_bdq(nv, budget, &qt->sub1, head1);
> >>>>> +     fbnic_fill_bdq(nv, &qt->sub1);
> >>>>
> >>>> I am not sure how complicated the rx handling will be for the advanced
> >>>> feature. For the current code, for each entry/desc in both qt->sub0 and
> >>>> qt->sub1 at least need one page, and the page seems to be only used once
> >>>> no matter however small the page is used?
> >>>>
> >>>> I am assuming you want to do 'tightly optimized' operation for this by
> >>>> calling page_pool_fragment_page(), but manipulating page->pp_ref_count
> >>>> directly does not seems to add any value for the current code, but seem
> >>>> to waste a lot of memory by not using the frag API, especially PAGE_SIZE
> >>>>> 4K?
> >>>
> >>> On this hardware both the header and payload buffers are fragmentable.
> >>> The hardware decides the partitioning and we just follow it. So for
> >>> example it wouldn't be uncommon to have a jumbo frame split up such
> >>> that the header is less than 128B plus SKB overhead while the actual
> >>> data in the payload is just over 1400. So for us fragmenting the pages
> >>> is a very likely case especially with smaller packets.
> >>
> >> I understand that is what you are trying to do, but the code above does
> >> not seems to match the description, as the fbnic_clean_bdq() and
> >> fbnic_fill_bdq() are called for qt->sub0 and qt->sub1, so the old pages
> >> of qt->sub0 and qt->sub1 just cleaned are drained and refill each sub
> >> with new pages, which does not seems to have any fragmenting?
> >
> > That is because it is all taken care of by the completion queue. Take
> > a look in fbnic_pkt_prepare. We are taking the buffer from the header
> > descriptor and taking a slice out of it there via fbnic_page_pool_get.
> > Basically we store the fragment count locally in the rx_buf and then
> > subtract what is leftover when the device is done with it.
>
> The above looks a lot like the prepare/commit API in [1], the prepare
> is done in fbnic_fill_bdq() and commit is done by fbnic_page_pool_get() in
> fbnic_pkt_prepare() and fbnic_add_rx_frag().
>
> If page_pool is able to provide a central place for pagecnt_bias of all the
> fragments of the same page, we may provide a similar prepare/commit API for
> frag API, I am not sure how to handle it for now.
>
> From the below macro, this hw seems to be only able to handle 4K memory for
> each entry/desc in qt->sub0 and qt->sub1, so there seems to be a lot of memory
> that is unused for PAGE_SIZE > 4K as it is allocating memory based on page
> granularity for each rx_buf in qt->sub0 and qt->sub1.
>
> +#define FBNIC_RCD_AL_BUFF_OFF_MASK             DESC_GENMASK(43, 32)

The advantage of being a purpose built driver is that we aren't
running on any architectures where the PAGE_SIZE > 4K. If it came to
that we could probably look at splitting the pages within the
descriptors by simply having a single page span multiple descriptors.

> It is still possible to reserve enough pagecnt_bias for each fragment, so that
> the caller can still do its own fragmenting on fragment granularity as we
> seems to have enough pagecnt_bias for each page.
>
> If we provide a proper frag API to reserve enough pagecnt_bias for caller to
> do its own fragmenting, then the memory waste may be avoided for this hw in
> system with PAGE_SIZE > 4K.
>
> 1. https://lore.kernel.org/lkml/20240407130850.19625-10-linyunsheng@huawei.com/

That isn't a concern for us as we are only using the device on x86
systems at this time.

> >
> >> The fragmenting can only happen when there is continuous small packet
> >> coming from wire so that hw can report the same pg_id for different
> >> packet with pg_offset before fbnic_clean_bdq() and fbnic_fill_bdq()
> >> is called? I am not sure how to ensure that considering that we might
> >> break out of while loop in fbnic_clean_rcq() because of 'packets < budget'
> >> checking.
> >
> > We don't free the page until we have moved one past it, or the
> > hardware has indicated it will take no more slices via a PAGE_FIN bit
> > in the descriptor.
>
>
> I look more closely at it, I am not able to figure it out how it is done
> yet, as the PAGE_FIN bit mentioned above seems to be only used to calculate
> the hdr_pg_end and truesize in fbnic_pkt_prepare() and fbnic_add_rx_frag().
>
> For the below flow in fbnic_clean_rcq(), fbnic_clean_bdq() will be called
> to drain the page in rx_buf just cleaned when head0/head1 >= 0, so I am not
> sure how it do the fragmenting yet, am I missing something obvious here?
>
>         while (likely(packets < budget)) {
>                 switch (FIELD_GET(FBNIC_RCD_TYPE_MASK, rcd)) {
>                 case FBNIC_RCD_TYPE_HDR_AL:
>                         head0 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd);
>                         fbnic_pkt_prepare(nv, rcd, pkt, qt);
>
>                         break;
>                 case FBNIC_RCD_TYPE_PAY_AL:
>                         head1 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd);
>                         fbnic_add_rx_frag(nv, rcd, pkt, qt);
>
>                         break;
>
>                 case FBNIC_RCD_TYPE_META:
>                         if (likely(!fbnic_rcd_metadata_err(rcd)))
>                                 skb = fbnic_build_skb(nv, pkt);
>
>                         /* populate skb and invalidate XDP */
>                         if (!IS_ERR_OR_NULL(skb)) {
>                                 fbnic_populate_skb_fields(nv, rcd, skb, qt);
>
>                                 packets++;
>
>                                 napi_gro_receive(&nv->napi, skb);
>                         }
>
>                         pkt->buff.data_hard_start = NULL;
>
>                         break;
>                 }
>
>         /* Unmap and free processed buffers */
>         if (head0 >= 0)
>                 fbnic_clean_bdq(nv, budget, &qt->sub0, head0);
>         fbnic_fill_bdq(nv, &qt->sub0);
>
>         if (head1 >= 0)
>                 fbnic_clean_bdq(nv, budget, &qt->sub1, head1);
>         fbnic_fill_bdq(nv, &qt->sub1);
>
>         }

The cleanup logic cleans everything up to but not including the
head0/head1 offsets. So the pages are left on the ring until they are
fully consumed.
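
That rule can be put in a few lines; the ex_* types and the put_page()
call below are illustrative stand-ins, not the fbnic code:

#include <linux/mm.h>

struct ex_bdq {
	struct page	**pages;
	unsigned int	mask;	/* ring size - 1 */
	unsigned int	head;
};

/* Free buffers strictly before the index the completion ring last
 * reported; the buffer at that index may still be handed more slices,
 * so it stays on the ring until a later completion moves past it.
 */
static void ex_clean_bufs(struct ex_bdq *bdq, unsigned int reported)
{
	while (bdq->head != reported) {
		unsigned int idx = bdq->head++ & bdq->mask;

		put_page(bdq->pages[idx]);	/* real driver: page_pool release */
		bdq->pages[idx] = NULL;
	}
}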

> >
> >>> It is better for us to optimize for the small packet scenario than
> >>> optimize for the case where 4K slices are getting taken. That way when
> >>> we are CPU constrained handling small packets we are the most
> >>> optimized whereas for the larger frames we can spare a few cycles to
> >>> account for the extra overhead. The result should be a higher overall
> >>> packets per second.
> >>
> >> The problem is that small packet means low utilization of the bandwidth
> >> as more bandwidth is used to send header instead of payload that is useful
> >> for the user, so the question seems to be how often the small packet is
> >> seen in the wire?
> >
> > Very often. Especially when you are running something like servers
> > where the flow usually consists of an incoming request which is often
> > only a few hundred bytes, followed by us sending a response which then
> > leads to a flow of control frames for it.
>
> I think this is depending on the use case, if it is video streaming server,
> I guess most of the packet is mtu-sized?

For the transmit side, yes. For the server side, no. A typical TCP flow
has two sides to it: one sending SYN/ACK/FIN requests and the initial
get request, and the other basically sending the response data.
Yunsheng Lin April 15, 2024, 1:19 p.m. UTC | #8
On 2024/4/12 23:05, Alexander Duyck wrote:

...

>>
>> From the below macro, this hw seems to be only able to handle 4K memory for
>> each entry/desc in qt->sub0 and qt->sub1, so there seems to be a lot of memory
>> that is unused for PAGE_SIZE > 4K as it is allocating memory based on page
>> granularity for each rx_buf in qt->sub0 and qt->sub1.
>>
>> +#define FBNIC_RCD_AL_BUFF_OFF_MASK             DESC_GENMASK(43, 32)
> 
> The advantage of being a purpose built driver is that we aren't
> running on any architectures where the PAGE_SIZE > 4K. If it came to

I am not sure if 'being a purpose built driver' argument is strong enough
here, at least the Kconfig does not seems to be suggesting it is a purpose
built driver, perhaps add a 'depend on' to suggest that?

> that we could probably look at splitting the pages within the
> descriptors by simply having a single page span multiple descriptors.

My point is that we might be able to meet the above use case with a proper
API without driver manipulating the reference counting by calling
page_pool_fragment_page() directly.
Alexander H Duyck April 15, 2024, 3:03 p.m. UTC | #9
On Mon, Apr 15, 2024 at 6:19 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/4/12 23:05, Alexander Duyck wrote:
>
> ...
>
> >>
> >> From the below macro, this hw seems to be only able to handle 4K memory for
> >> each entry/desc in qt->sub0 and qt->sub1, so there seems to be a lot of memory
> >> that is unused for PAGE_SIZE > 4K as it is allocating memory based on page
> >> granularity for each rx_buf in qt->sub0 and qt->sub1.
> >>
> >> +#define FBNIC_RCD_AL_BUFF_OFF_MASK             DESC_GENMASK(43, 32)
> >
> > The advantage of being a purpose built driver is that we aren't
> > running on any architectures where the PAGE_SIZE > 4K. If it came to
>
> I am not sure if 'being a purpose built driver' argument is strong enough
> here, at least the Kconfig does not seems to be suggesting it is a purpose
> built driver, perhaps add a 'depend on' to suggest that?

I'm not sure if you have been following the other threads. One of the
general thoughts of pushback against this driver was that Meta is
currently the only company that will have possession of this NIC. As
such Meta will be deciding what systems it goes into and as a result
of that we aren't likely to be running it on systems with 64K pages.

> > that we could probably look at splitting the pages within the
> > descriptors by simply having a single page span multiple descriptors.
>
> My point is that we might be able to meet the above use case with a proper
> API without driver manipulating the reference counting by calling
> page_pool_fragment_page() directly.

My suggestion would be to look at putting your proposed API together
as something that can be used by another driver. Once we hit that I
can then look at incorporating it into fbnic. One issue right now is
that the current patch set is meant to make use of existing APIs
instead of needing to rely on creating new ones as this isn't a device
others will have access to so it will make it harder to test any
proposed API based only on fbnic.
Jakub Kicinski April 15, 2024, 5:11 p.m. UTC | #10
On Mon, 15 Apr 2024 08:03:38 -0700 Alexander Duyck wrote:
> > > The advantage of being a purpose built driver is that we aren't
> > > running on any architectures where the PAGE_SIZE > 4K. If it came to  
> >
> > I am not sure if 'being a purpose built driver' argument is strong enough
> > here, at least the Kconfig does not seems to be suggesting it is a purpose
> > built driver, perhaps add a 'depend on' to suggest that?  
> 
> I'm not sure if you have been following the other threads. One of the
> general thoughts of pushback against this driver was that Meta is
> currently the only company that will have possession of this NIC. As
> such Meta will be deciding what systems it goes into and as a result
> of that we aren't likely to be running it on systems with 64K pages.

Didn't take long for this argument to float to the surface..

We tried to write some rules with Paolo but haven't published them, yet.
Here is one that may be relevant:

  3. External contributions
  -------------------------

  Owners of drivers for private devices must not exhibit a stronger
  sense of ownership or push back on accepting code changes from
  members of the community. 3rd party contributions should be evaluated
  and eventually accepted, or challenged only on technical arguments
  based on the code itself. In particular, the argument that the owner
  is the only user and therefore knows best should not be used.

Not exactly a contribution, but we predicted the "we know best"
tone of the argument :(
Alexander H Duyck April 15, 2024, 6:03 p.m. UTC | #11
On Mon, Apr 15, 2024 at 10:11 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Mon, 15 Apr 2024 08:03:38 -0700 Alexander Duyck wrote:
> > > > The advantage of being a purpose built driver is that we aren't
> > > > running on any architectures where the PAGE_SIZE > 4K. If it came to
> > >
> > > I am not sure if 'being a purpose built driver' argument is strong enough
> > > here, at least the Kconfig does not seems to be suggesting it is a purpose
> > > built driver, perhaps add a 'depend on' to suggest that?
> >
> > I'm not sure if you have been following the other threads. One of the
> > general thoughts of pushback against this driver was that Meta is
> > currently the only company that will have possession of this NIC. As
> > such Meta will be deciding what systems it goes into and as a result
> > of that we aren't likely to be running it on systems with 64K pages.
>
> Didn't take long for this argument to float to the surface..

This wasn't my full argument. You truncated the part where I
specifically called out that it is hard to justify us pushing a
proprietary API that is only used by our driver.

> We tried to write some rules with Paolo but haven't published them, yet.
> Here is one that may be relevant:
>
>   3. External contributions
>   -------------------------
>
>   Owners of drivers for private devices must not exhibit a stronger
>   sense of ownership or push back on accepting code changes from
>   members of the community. 3rd party contributions should be evaluated
>   and eventually accepted, or challenged only on technical arguments
>   based on the code itself. In particular, the argument that the owner
>   is the only user and therefore knows best should not be used.
>
> Not exactly a contribution, but we predicted the "we know best"
> tone of the argument :(

The "we know best" is more of an "I know best" as someone who has
worked with page pool and the page fragment API since well before it
existed. My push back is based on the fact that we don't want to
allocate fragments, we want to allocate pages and fragment them
ourselves after the fact. As such it doesn't make much sense to add an
API that will have us trying to use the page fragment API which holds
onto the page when the expectation is that we will take the whole
thing and just fragment it ourselves.

If we are going to use the page fragment API we need the ability to
convert a page pool page to a fragment in place, not have it get
pulled into the page fragment API and then immediately yanked right
back out. On top of that we don't need to be making significant
changes to the API that will slow down all the other users to
accommodate a driver that will not be used by most users.

This is a case where I agree with Jiri. It doesn't make sense to slow
down all the other users of the page fragment API by making it so that
we can pull variable sliced batches from the page pool fragment
interface. It makes much more sense for us to just fragment in place
and add as little overhead to the other users of page pool APIs as
possible.
Jakub Kicinski April 15, 2024, 6:19 p.m. UTC | #12
On Mon, 15 Apr 2024 11:03:13 -0700 Alexander Duyck wrote:
> This wasn't my full argument. You truncated the part where I
> specifically called out that it is hard to justify us pushing a
> proprietary API that is only used by our driver.

I see. Please be careful when making such arguments, tho.

> The "we know best" is more of an "I know best" as someone who has
> worked with page pool and the page fragment API since well before it
> existed. My push back is based on the fact that we don't want to
> allocate fragments, we want to allocate pages and fragment them
> ourselves after the fact. As such it doesn't make much sense to add an
> API that will have us trying to use the page fragment API which holds
> onto the page when the expectation is that we will take the whole
> thing and just fragment it ourselves.

To be clear I'm not arguing for the exact use of the API as suggested.
Or even that we should support this in the shared API. One would
probably have to take a stab at coding it up to find out what works
best. My first try FWIW would be to mask off the low bits of the
page index, eg. for 64k page making entries 0-15 all use rx_buf 
index 0...
Alexander H Duyck April 15, 2024, 6:55 p.m. UTC | #13
On Mon, Apr 15, 2024 at 11:19 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Mon, 15 Apr 2024 11:03:13 -0700 Alexander Duyck wrote:
> > This wasn't my full argument. You truncated the part where I
> > specifically called out that it is hard to justify us pushing a
> > proprietary API that is only used by our driver.
>
> I see. Please be careful when making such arguments, tho.
>
> > The "we know best" is more of an "I know best" as someone who has
> > worked with page pool and the page fragment API since well before it
> > existed. My push back is based on the fact that we don't want to
> > allocate fragments, we want to allocate pages and fragment them
> > ourselves after the fact. As such it doesn't make much sense to add an
> > API that will have us trying to use the page fragment API which holds
> > onto the page when the expectation is that we will take the whole
> > thing and just fragment it ourselves.
>
> To be clear I'm not arguing for the exact use of the API as suggested.
> Or even that we should support this in the shared API. One would
> probably have to take a stab at coding it up to find out what works
> best. My first try FWIW would be to mask off the low bits of the
> page index, eg. for 64k page making entries 0-15 all use rx_buf
> index 0...

It would take a few more changes to make it all work. Basically we
would need to map the page into every descriptor entry since the worst
case scenario would be that somehow we end up with things getting so
tight that the page is only partially mapped and we are working
through it as a subset of 4K slices with some at the beginning being
unmapped from the descriptor ring while some are still waiting to be
assigned to a descriptor and used. What I would probably have to look
at doing is adding some sort of cache on the ring to hold onto it
while we dole it out 4K at a time to the descriptors. Either that or
enforce a hard 16 descriptor limit where we have to assign a full page
with every allocation meaning we are at a higher risk for starving the
device for memory.

The bigger issue would be how could we test it? This is an OCP NIC and
as far as I am aware we don't have any systems available that would
support a 64K page. I suppose I could rebuild the QEMU for an
architecture that supports 64K pages and test it. It would just be
painful to have to set up a virtual system to test code that would
literally never be used again. I am not sure QEMU can generate enough
stress to really test the page allocator and make sure all corner
cases are covered.
Jakub Kicinski April 15, 2024, 10:01 p.m. UTC | #14
On Mon, 15 Apr 2024 11:55:37 -0700 Alexander Duyck wrote:
> It would take a few more changes to make it all work. Basically we
> would need to map the page into every descriptor entry since the worst
> case scenario would be that somehow we end up with things getting so
> tight that the page is only partially mapped and we are working
> through it as a subset of 4K slices with some at the beginning being
> unmapped from the descriptor ring while some are still waiting to be
> assigned to a descriptor and used. What I would probably have to look
> at doing is adding some sort of cache on the ring to hold onto it
> while we dole it out 4K at a time to the descriptors. Either that or
> enforce a hard 16 descriptor limit where we have to assign a full page
> with every allocation meaning we are at a higher risk for starving the
> device for memory.

Hm, that would be more work, indeed, but potentially beneficial. I was
thinking of separating the page allocation and draining logic a bit
from the fragment handling logic.

#define RXPAGE_IDX(idx)		((idx) >> PAGE_SHIFT - 12)

in fbnic_clean_bdq():

	while (RXPAGE_IDX(head) != RXPAGE_IDX(hw_head))

refer to rx_buf as:

	struct fbnic_rx_buf *rx_buf = &ring->rx_buf[idx >> LOSE_BITS];

Refill always works in batches of multiple of PAGE_SIZE / 4k.
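
Pieced together, that suggestion might look roughly like the sketch
below. Everything here is illustrative: the ex_* types, the 4K slice
size and the omitted ring-wraparound/DMA handling are assumptions, not
fbnic code. With 4K pages LOSE_BITS is 0 and this degenerates to the
current per-descriptor behaviour.

#include <linux/mm.h>

#define SLICE_SHIFT	12			/* 4K HW buffer slice */
#define LOSE_BITS	(PAGE_SHIFT - SLICE_SHIFT)	/* log2(descs per page) */
#define RXPAGE_IDX(idx)	((idx) >> LOSE_BITS)

struct ex_rx_page {
	struct page	*page;
	long		pagecnt_bias;
};

struct ex_ring {
	struct ex_rx_page	*rx_buf;	/* one entry per page, not per desc */
	unsigned int		head;
};

static void ex_drop_page(struct ex_rx_page *buf)
{
	/* A real driver would settle the leftover page_pool references
	 * here; elided to keep the sketch short.
	 */
	buf->page = NULL;
}

static void ex_clean_bdq(struct ex_ring *ring, unsigned int hw_head)
{
	unsigned int head = ring->head;

	/* Retire a page only once the HW head has moved past every 4K
	 * slice carved out of it; partially consumed pages stay put.
	 */
	while (RXPAGE_IDX(head) != RXPAGE_IDX(hw_head)) {
		ex_drop_page(&ring->rx_buf[RXPAGE_IDX(head)]);

		/* jump to the first descriptor of the next page */
		head = (RXPAGE_IDX(head) + 1) << LOSE_BITS;
	}

	ring->head = head;
}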

> The bigger issue would be how could we test it? This is an OCP NIC and
> as far as I am aware we don't have any systems available that would
> support a 64K page. I suppose I could rebuild the QEMU for an
> architecture that supports 64K pages and test it. It would just be
> painful to have to set up a virtual system to test code that would
> literally never be used again. I am not sure QEMU can generate enough
> stress to really test the page allocator and make sure all corner
> cases are covered.

The testing may be tricky. We could possibly test with hacking up the
driver to use compound pages (say always allocate 16k) and making sure
we don't refer to PAGE_SIZE directly in the test.

BTW I have a spreadsheet of "promises", I'd be fine if we set a
deadline for FBNIC to gain support for PAGE_SIZE != 4k and Kconfig 
to x86-only for now..
Alexander H Duyck April 15, 2024, 11:57 p.m. UTC | #15
On Mon, Apr 15, 2024 at 3:01 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Mon, 15 Apr 2024 11:55:37 -0700 Alexander Duyck wrote:
> > It would take a few more changes to make it all work. Basically we
> > would need to map the page into every descriptor entry since the worst
> > case scenario would be that somehow we end up with things getting so
> > tight that the page is only partially mapped and we are working
> > through it as a subset of 4K slices with some at the beginning being
> > unmapped from the descriptor ring while some are still waiting to be
> > assigned to a descriptor and used. What I would probably have to look
> > at doing is adding some sort of cache on the ring to hold onto it
> > while we dole it out 4K at a time to the descriptors. Either that or
> > enforce a hard 16 descriptor limit where we have to assign a full page
> > with every allocation meaning we are at a higher risk for starving the
> > device for memory.
>
> Hm, that would be more work, indeed, but potentially beneficial. I was
> thinking of separating the page allocation and draining logic a bit
> from the fragment handling logic.
>
> #define RXPAGE_IDX(idx)         ((idx) >> PAGE_SHIFT - 12)
>
> in fbnic_clean_bdq():
>
>         while (RXPAGE_IDX(head) != RXPAGE_IDX(hw_head))
>
> refer to rx_buf as:
>
>         struct fbnic_rx_buf *rx_buf = &ring->rx_buf[idx >> LOSE_BITS];
>
> Refill always works in batches of multiple of PAGE_SIZE / 4k.
>
> > The bigger issue would be how could we test it? This is an OCP NIC and
> > as far as I am aware we don't have any systems available that would
> > support a 64K page. I suppose I could rebuild the QEMU for an
> > architecture that supports 64K pages and test it. It would just be
> > painful to have to set up a virtual system to test code that would
> > literally never be used again. I am not sure QEMU can generate enough
> > stress to really test the page allocator and make sure all corner
> > cases are covered.
>
> The testing may be tricky. We could possibly test with hacking up the
> driver to use compound pages (say always allocate 16k) and making sure
> we don't refer to PAGE_SIZE directly in the test.
>
> BTW I have a spreadsheet of "promises", I'd be fine if we set a
> deadline for FBNIC to gain support for PAGE_SIZE != 4k and Kconfig
> to x86-only for now..

Why set a deadline? It doesn't make sense to add as a feature for now.

I would be fine with limiting it to x86-only and then stating that if
we need to change it to add support for an architecture that does
support !4K page size then we can cross that bridge when we get there
as it would be much more likely that we would have access to a
platform to test it on rather than adding overhead to the code to
support a setup that this device may never see in its lifetime.
Jakub Kicinski April 16, 2024, 12:24 a.m. UTC | #16
On Mon, 15 Apr 2024 16:57:54 -0700 Alexander Duyck wrote:
> > The testing may be tricky. We could possibly test with hacking up the
> > driver to use compound pages (say always allocate 16k) and making sure
> > we don't refer to PAGE_SIZE directly in the test.
> >
> > BTW I have a spreadsheet of "promises", I'd be fine if we set a
> > deadline for FBNIC to gain support for PAGE_SIZE != 4k and Kconfig
> > to x86-only for now..  
> 
> Why set a deadline? It doesn't make sense to add as a feature for now.

Okay, maybe I'm trying to be too nice. Please have all the feedback
addressed in v2.
Yunsheng Lin April 16, 2024, 1:25 p.m. UTC | #17
On 2024/4/16 6:01, Jakub Kicinski wrote:
> On Mon, 15 Apr 2024 11:55:37 -0700 Alexander Duyck wrote:
>> It would take a few more changes to make it all work. Basically we
>> would need to map the page into every descriptor entry since the worst
>> case scenario would be that somehow we end up with things getting so
>> tight that the page is only partially mapped and we are working
>> through it as a subset of 4K slices with some at the beginning being
>> unmapped from the descriptor ring while some are still waiting to be
>> assigned to a descriptor and used. What I would probably have to look
>> at doing is adding some sort of cache on the ring to hold onto it
>> while we dole it out 4K at a time to the descriptors. Either that or
>> enforce a hard 16 descriptor limit where we have to assign a full page
>> with every allocation meaning we are at a higher risk for starving the
>> device for memory.
> 
> Hm, that would be more work, indeed, but potentially beneficial. I was
> thinking of separating the page allocation and draining logic a bit
> from the fragment handling logic.
> 
> #define RXPAGE_IDX(idx)		((idx) >> PAGE_SHIFT - 12)
> 
> in fbnic_clean_bdq():
> 
> 	while (RXPAGE_IDX(head) != RXPAGE_IDX(hw_head))
> 
> refer to rx_buf as:
> 
> 	struct fbnic_rx_buf *rx_buf = &ring->rx_buf[idx >> LOSE_BITS];
> 
> Refill always works in batches of multiple of PAGE_SIZE / 4k.

Are we expecting drivers wanting best possible performance doing the
above duplicated trick?

"grep -rn '_reuse_' drivers/net/ethernet/" seems to suggest that we
already have similar trick to do the page spliting in a lot of drivers,
I would rather we do not duplicate the above trick again.
Alexander Lobakin April 16, 2024, 2:05 p.m. UTC | #18
From: Alexander Duyck <alexander.duyck@gmail.com>
Date: Mon, 15 Apr 2024 11:03:13 -0700

> On Mon, Apr 15, 2024 at 10:11 AM Jakub Kicinski <kuba@kernel.org> wrote:
>>
>> On Mon, 15 Apr 2024 08:03:38 -0700 Alexander Duyck wrote:
>>>>> The advantage of being a purpose built driver is that we aren't
>>>>> running on any architectures where the PAGE_SIZE > 4K. If it came to
>>>>
>>>> I am not sure if 'being a purpose built driver' argument is strong enough
>>>> here, at least the Kconfig does not seems to be suggesting it is a purpose
>>>> built driver, perhaps add a 'depend on' to suggest that?
>>>
>>> I'm not sure if you have been following the other threads. One of the
>>> general thoughts of pushback against this driver was that Meta is
>>> currently the only company that will have possession of this NIC. As
>>> such Meta will be deciding what systems it goes into and as a result
>>> of that we aren't likely to be running it on systems with 64K pages.
>>
>> Didn't take long for this argument to float to the surface..
> 
> This wasn't my full argument. You truncated the part where I
> specifically called out that it is hard to justify us pushing a
> proprietary API that is only used by our driver.
> 
>> We tried to write some rules with Paolo but haven't published them, yet.
>> Here is one that may be relevant:
>>
>>   3. External contributions
>>   -------------------------
>>
>>   Owners of drivers for private devices must not exhibit a stronger
>>   sense of ownership or push back on accepting code changes from
>>   members of the community. 3rd party contributions should be evaluated
>>   and eventually accepted, or challenged only on technical arguments
>>   based on the code itself. In particular, the argument that the owner
>>   is the only user and therefore knows best should not be used.
>>
>> Not exactly a contribution, but we predicted the "we know best"
>> tone of the argument :(
> 
> The "we know best" is more of an "I know best" as someone who has
> worked with page pool and the page fragment API since well before it
> existed. My push back is based on the fact that we don't want to

I still strongly believe Jesper-style arguments like "I've been working
with this for aeons", "I invented the Internet", "I was born 3 decades
before this API was introduced" are not valid arguments.

> allocate fragments, we want to allocate pages and fragment them
> ourselves after the fact. As such it doesn't make much sense to add an
> API that will have us trying to use the page fragment API which holds
> onto the page when the expectation is that we will take the whole
> thing and just fragment it ourselves.

[...]

Re "this HW works only on x86, why bother" -- I still believe there
shouldn't be any hardcodes in any driver based on the fact that the HW
is deployed only on particular systems. Page sizes, Endianness,
32/64-bit... It's not difficult to make a driver look like it's
universal and could work anywhere, really.

Thanks,
Olek
Alexander H Duyck April 16, 2024, 2:35 p.m. UTC | #19
On Tue, Apr 16, 2024 at 6:25 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/4/16 6:01, Jakub Kicinski wrote:
> > On Mon, 15 Apr 2024 11:55:37 -0700 Alexander Duyck wrote:
> >> It would take a few more changes to make it all work. Basically we
> >> would need to map the page into every descriptor entry since the worst
> >> case scenario would be that somehow we end up with things getting so
> >> tight that the page is only partially mapped and we are working
> >> through it as a subset of 4K slices with some at the beginning being
> >> unmapped from the descriptor ring while some are still waiting to be
> >> assigned to a descriptor and used. What I would probably have to look
> >> at doing is adding some sort of cache on the ring to hold onto it
> >> while we dole it out 4K at a time to the descriptors. Either that or
> >> enforce a hard 16 descriptor limit where we have to assign a full page
> >> with every allocation meaning we are at a higher risk for starving the
> >> device for memory.
> >
> > Hm, that would be more work, indeed, but potentially beneficial. I was
> > thinking of separating the page allocation and draining logic a bit
> > from the fragment handling logic.
> >
> > #define RXPAGE_IDX(idx)               ((idx) >> PAGE_SHIFT - 12)
> >
> > in fbnic_clean_bdq():
> >
> >       while (RXPAGE_IDX(head) != RXPAGE_IDX(hw_head))
> >
> > refer to rx_buf as:
> >
> >       struct fbnic_rx_buf *rx_buf = &ring->rx_buf[idx >> LOSE_BITS];
> >
> > Refill always works in batches of multiple of PAGE_SIZE / 4k.
>
> Are we expecting drivers wanting best possible performance doing the
> above duplicated trick?
>
> "grep -rn '_reuse_' drivers/net/ethernet/" seems to suggest that we
> already have similar trick to do the page spliting in a lot of drivers,
> I would rather we do not duplicate the above trick again.

Then why not focus on those drivers? You may have missed the whole
point but it isn't possible to test this device on a system with 64K
pages currently. There aren't any platforms we can drop the device
into that support that.
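
For readers skimming the thread, here is a minimal, untested sketch of the per-page bookkeeping Jakub outlines above. struct fbnic_ring and fbnic_rx_buf are the driver's own types; the FBNIC_* macros and the helper itself are made up purely for illustration.

/* Sketch only: when PAGE_SIZE > 4K, one page backs several 4K buffer
 * descriptors, so rx_buf bookkeeping is indexed per page rather than
 * per descriptor.  Ring wrap handling is omitted for brevity.
 */
#define FBNIC_BUF_SHIFT		12	/* HW hands out 4K slices */
#define FBNIC_LOSE_BITS		(PAGE_SHIFT - FBNIC_BUF_SHIFT)
#define FBNIC_RXPAGE_IDX(idx)	((idx) >> FBNIC_LOSE_BITS)

static void fbnic_clean_bdq_by_page(struct fbnic_ring *ring,
				    unsigned int head, unsigned int hw_head)
{
	/* Only release a page once every 4K slice carved from it is done */
	while (FBNIC_RXPAGE_IDX(head) != FBNIC_RXPAGE_IDX(hw_head)) {
		struct fbnic_rx_buf *rx_buf =
			&ring->rx_buf[FBNIC_RXPAGE_IDX(head)];

		WARN_ON_ONCE(!rx_buf->page);
		/* unmap/recycle rx_buf->page would go here */
		head += 1 << FBNIC_LOSE_BITS;	/* advance one whole page */
	}
}

On 4K-page systems this degenerates to FBNIC_LOSE_BITS == 0, i.e. one rx_buf per descriptor, which is what the posted code does today.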
Alexander H Duyck April 16, 2024, 2:46 p.m. UTC | #20
On Tue, Apr 16, 2024 at 7:05 AM Alexander Lobakin
<aleksander.lobakin@intel.com> wrote:
>
> From: Alexander Duyck <alexander.duyck@gmail.com>
> Date: Mon, 15 Apr 2024 11:03:13 -0700
>
> > On Mon, Apr 15, 2024 at 10:11 AM Jakub Kicinski <kuba@kernel.org> wrote:
> >>
> >> On Mon, 15 Apr 2024 08:03:38 -0700 Alexander Duyck wrote:
> >>>>> The advantage of being a purpose built driver is that we aren't
> >>>>> running on any architectures where the PAGE_SIZE > 4K. If it came to
> >>>>
> >>>> I am not sure if 'being a purpose built driver' argument is strong enough
> >>>> here, at least the Kconfig does not seems to be suggesting it is a purpose
> >>>> built driver, perhaps add a 'depend on' to suggest that?
> >>>
> >>> I'm not sure if you have been following the other threads. One of the
> >>> general thoughts of pushback against this driver was that Meta is
> >>> currently the only company that will have possession of this NIC. As
> >>> such Meta will be deciding what systems it goes into and as a result
> >>> of that we aren't likely to be running it on systems with 64K pages.
> >>
> >> Didn't take long for this argument to float to the surface..
> >
> > This wasn't my full argument. You truncated the part where I
> > specifically called out that it is hard to justify us pushing a
> > proprietary API that is only used by our driver.
> >
> >> We tried to write some rules with Paolo but haven't published them, yet.
> >> Here is one that may be relevant:
> >>
> >>   3. External contributions
> >>   -------------------------
> >>
> >>   Owners of drivers for private devices must not exhibit a stronger
> >>   sense of ownership or push back on accepting code changes from
> >>   members of the community. 3rd party contributions should be evaluated
> >>   and eventually accepted, or challenged only on technical arguments
> >>   based on the code itself. In particular, the argument that the owner
> >>   is the only user and therefore knows best should not be used.
> >>
> >> Not exactly a contribution, but we predicted the "we know best"
> >> tone of the argument :(
> >
> > The "we know best" is more of an "I know best" as someone who has
> > worked with page pool and the page fragment API since well before it
> > existed. My push back is based on the fact that we don't want to
>
> I still strongly believe Jesper-style arguments like "I've been working
> with this for aeons", "I invented the Internet", "I was born 3 decades
> before this API was introduced" are not valid arguments.

Sorry that is a bit of my frustration with Yunsheng coming through. He
has another patch set that mostly just moves my code and made himself
the maintainer. Admittedly I am a bit annoyed with that. Especially
since the main drive seems to be to force everything to use that one
approach and then optimize for his use case for vhost net over all
others most likely at the expense of everything else.

It seems like it is the very thing we were complaining about in patch
0 with other drivers getting penalized at the cost of optimizing for
one specific driver.

> > allocate fragments, we want to allocate pages and fragment them
> > ourselves after the fact. As such it doesn't make much sense to add an
> > API that will have us trying to use the page fragment API which holds
> > onto the page when the expectation is that we will take the whole
> > thing and just fragment it ourselves.
>
> [...]
>
> Re "this HW works only on x86, why bother" -- I still believe there
> shouldn't be any hardcodes in any driver based on the fact that the HW
> is deployed only on particular systems. Page sizes, Endianness,
> 32/64-bit... It's not difficult to make a driver look like it's
> universal and could work anywhere, really.

It isn't that this only works on x86. It is that we can only test it
on x86. The biggest issue right now is that I wouldn't have any
systems w/ 64K pages that I could test on, and the worst that could
happen based on the current code is that the NIC driver will be a
memory hog.

I would much prefer the potential of being a memory hog on untested
hardware over implementing said code untested and introducing
something like a memory leak or page fault issue.

That is why I am more than willing to make this an x86_64 only driver
for now and we can look at expanding out as I get time and get
equipment to add support and test for other architectures.
Andrew Lunn April 16, 2024, 6:26 p.m. UTC | #21
> Sorry that is a bit of my frustration with Yunsheng coming through. He
> has another patch set that mostly just moves my code and made himself
> the maintainer. Admittedly I am a bit annoyed with that. Especially
> since the main drive seems to be to force everything to use that one
> approach and then optimize for his use case for vhost net over all
> others most likely at the expense of everything else.

That is why we ask for benchmarks for "optimization patches". If they
don't actually provide any improvements, or degrade other use cases,
they get rejected.

> That is why I am more than willing to make this an x86_64 only driver
> for now and we can look at expanding out as I get time and get
> equipment to add support and test for other architectures.

That sounds reasonable to me. But I would allow COMPILE_TEST for other
architectures, just to keep the number of surprises low when you do
have other architectures to test with.

     Andrew
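
A minimal Kconfig sketch of what that could look like; the symbol name, the PCI_MSI dependency, and the PAGE_POOL select here are assumptions for the example, not something taken from the posted series:

config FBNIC
	tristate "Meta Platforms Host Network Interface support"
	depends on X86_64 || COMPILE_TEST
	depends on PCI_MSI
	select PAGE_POOL
	help
	  The hardware currently only ships in X86_64 systems, while
	  COMPILE_TEST keeps build coverage on other architectures.

The "X86_64 || COMPILE_TEST" pattern is the usual way to restrict where a driver can be enabled without losing allmodconfig build coverage elsewhere.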
Leon Romanovsky April 17, 2024, 8:14 a.m. UTC | #22
On Tue, Apr 16, 2024 at 07:46:06AM -0700, Alexander Duyck wrote:
> On Tue, Apr 16, 2024 at 7:05 AM Alexander Lobakin
> <aleksander.lobakin@intel.com> wrote:
> >
> > From: Alexander Duyck <alexander.duyck@gmail.com>
> > Date: Mon, 15 Apr 2024 11:03:13 -0700
> >
> > > On Mon, Apr 15, 2024 at 10:11 AM Jakub Kicinski <kuba@kernel.org> wrote:
> > >>
> > >> On Mon, 15 Apr 2024 08:03:38 -0700 Alexander Duyck wrote:
> > >>>>> The advantage of being a purpose built driver is that we aren't
> > >>>>> running on any architectures where the PAGE_SIZE > 4K. If it came to
> > >>>>
> > >>>> I am not sure if 'being a purpose built driver' argument is strong enough
> > >>>> here, at least the Kconfig does not seems to be suggesting it is a purpose
> > >>>> built driver, perhaps add a 'depend on' to suggest that?
> > >>>
> > >>> I'm not sure if you have been following the other threads. One of the
> > >>> general thoughts of pushback against this driver was that Meta is
> > >>> currently the only company that will have possession of this NIC. As
> > >>> such Meta will be deciding what systems it goes into and as a result
> > >>> of that we aren't likely to be running it on systems with 64K pages.
> > >>
> > >> Didn't take long for this argument to float to the surface..
> > >
> > > This wasn't my full argument. You truncated the part where I
> > > specifically called out that it is hard to justify us pushing a
> > > proprietary API that is only used by our driver.
> > >
> > >> We tried to write some rules with Paolo but haven't published them, yet.
> > >> Here is one that may be relevant:
> > >>
> > >>   3. External contributions
> > >>   -------------------------
> > >>
> > >>   Owners of drivers for private devices must not exhibit a stronger
> > >>   sense of ownership or push back on accepting code changes from
> > >>   members of the community. 3rd party contributions should be evaluated
> > >>   and eventually accepted, or challenged only on technical arguments
> > >>   based on the code itself. In particular, the argument that the owner
> > >>   is the only user and therefore knows best should not be used.
> > >>
> > >> Not exactly a contribution, but we predicted the "we know best"
> > >> tone of the argument :(
> > >
> > > The "we know best" is more of an "I know best" as someone who has
> > > worked with page pool and the page fragment API since well before it
> > > existed. My push back is based on the fact that we don't want to
> >
> > I still strongly believe Jesper-style arguments like "I've been working
> > with this for aeons", "I invented the Internet", "I was born 3 decades
> > before this API was introduced" are not valid arguments.
> 
> Sorry that is a bit of my frustration with Yunsheng coming through. He
> has another patch set that mostly just moves my code and made himself
> the maintainer. Admittedly I am a bit annoyed with that. Especially
> since the main drive seems to be to force everything to use that one
> approach and then optimize for his use case for vhost net over all
> others most likely at the expense of everything else.
> 
> It seems like it is the very thing we were complaining about in patch
> 0 with other drivers getting penalized at the cost of optimizing for
> one specific driver.
> 
> > > allocate fragments, we want to allocate pages and fragment them
> > > ourselves after the fact. As such it doesn't make much sense to add an
> > > API that will have us trying to use the page fragment API which holds
> > > onto the page when the expectation is that we will take the whole
> > > thing and just fragment it ourselves.
> >
> > [...]
> >
> > Re "this HW works only on x86, why bother" -- I still believe there
> > shouldn't be any hardcodes in any driver based on the fact that the HW
> > is deployed only on particular systems. Page sizes, Endianness,
> > 32/64-bit... It's not difficult to make a driver look like it's
> > universal and could work anywhere, really.
> 
> It isn't that this only works on x86. It is that we can only test it
> on x86. The biggest issue right now is that I wouldn't have any
> systems w/ 64K pages that I could test on.

Didn't you write that you will provide QEMU emulation for this device?

Thanks
Alexander Lobakin April 17, 2024, 10:39 a.m. UTC | #23
From: Alexander Duyck <alexander.duyck@gmail.com>
Date: Tue, 16 Apr 2024 07:46:06 -0700

> On Tue, Apr 16, 2024 at 7:05 AM Alexander Lobakin
> <aleksander.lobakin@intel.com> wrote:
>>
>> From: Alexander Duyck <alexander.duyck@gmail.com>
>> Date: Mon, 15 Apr 2024 11:03:13 -0700
>>
>>> On Mon, Apr 15, 2024 at 10:11 AM Jakub Kicinski <kuba@kernel.org> wrote:
>>>>
>>>> On Mon, 15 Apr 2024 08:03:38 -0700 Alexander Duyck wrote:
>>>>>>> The advantage of being a purpose built driver is that we aren't
>>>>>>> running on any architectures where the PAGE_SIZE > 4K. If it came to
>>>>>>
>>>>>> I am not sure if 'being a purpose built driver' argument is strong enough
>>>>>> here, at least the Kconfig does not seems to be suggesting it is a purpose
>>>>>> built driver, perhaps add a 'depend on' to suggest that?
>>>>>
>>>>> I'm not sure if you have been following the other threads. One of the
>>>>> general thoughts of pushback against this driver was that Meta is
>>>>> currently the only company that will have possession of this NIC. As
>>>>> such Meta will be deciding what systems it goes into and as a result
>>>>> of that we aren't likely to be running it on systems with 64K pages.
>>>>
>>>> Didn't take long for this argument to float to the surface..
>>>
>>> This wasn't my full argument. You truncated the part where I
>>> specifically called out that it is hard to justify us pushing a
>>> proprietary API that is only used by our driver.
>>>
>>>> We tried to write some rules with Paolo but haven't published them, yet.
>>>> Here is one that may be relevant:
>>>>
>>>>   3. External contributions
>>>>   -------------------------
>>>>
>>>>   Owners of drivers for private devices must not exhibit a stronger
>>>>   sense of ownership or push back on accepting code changes from
>>>>   members of the community. 3rd party contributions should be evaluated
>>>>   and eventually accepted, or challenged only on technical arguments
>>>>   based on the code itself. In particular, the argument that the owner
>>>>   is the only user and therefore knows best should not be used.
>>>>
>>>> Not exactly a contribution, but we predicted the "we know best"
>>>> tone of the argument :(
>>>
>>> The "we know best" is more of an "I know best" as someone who has
>>> worked with page pool and the page fragment API since well before it
>>> existed. My push back is based on the fact that we don't want to
>>
>> I still strongly believe Jesper-style arguments like "I've been working
>> with this for aeons", "I invented the Internet", "I was born 3 decades
>> before this API was introduced" are not valid arguments.
> 
> Sorry that is a bit of my frustration with Yunsheng coming through. He
> has another patch set that mostly just moves my code and made himself
> the maintainer. Admittedly I am a bit annoyed with that. Especially
> since the main drive seems to be to force everything to use that one
> approach and then optimize for his use case for vhost net over all
> others most likely at the expense of everything else.
> 
> It seems like it is the very thing we were complaining about in patch
> 0 with other drivers getting penalized at the cost of optimizing for
> one specific driver.
> 
>>> allocate fragments, we want to allocate pages and fragment them
>>> ourselves after the fact. As such it doesn't make much sense to add an
>>> API that will have us trying to use the page fragment API which holds
>>> onto the page when the expectation is that we will take the whole
>>> thing and just fragment it ourselves.
>>
>> [...]
>>
>> Re "this HW works only on x86, why bother" -- I still believe there
>> shouldn't be any hardcodes in any driver based on the fact that the HW
>> is deployed only on particular systems. Page sizes, Endianness,
>> 32/64-bit... It's not difficult to make a driver look like it's
>> universal and could work anywhere, really.
> 
> It isn't that this only works on x86. It is that we can only test it
> on x86. The biggest issue right now is that I wouldn't have any
> systems w/ 64K pages that I could test on, and the worst that could
> happen based on the current code is that the NIC driver will be a
> memory hog.
> 
> I would much prefer the potential of being a memory hog on untested
> hardware over implementing said code untested and introducing
> something like a memory leak or page fault issue.
> 
> That is why I am more than willing to make this an x86_64 only driver
> for now and we can look at expanding out as I get time and get
> equipment to add support and test for other architectures.

I don't see any issue with not limiting it to x86_64 only. It compiles just
fine, and if you run it later on a non-x86_64 system, you'll test it
then. I don't think anyone will run it on a different platform prior to
that.

Thanks,
Olek
Alexander H Duyck April 17, 2024, 4:09 p.m. UTC | #24
On Wed, Apr 17, 2024 at 1:14 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> On Tue, Apr 16, 2024 at 07:46:06AM -0700, Alexander Duyck wrote:
> > On Tue, Apr 16, 2024 at 7:05 AM Alexander Lobakin
> > <aleksander.lobakin@intel.com> wrote:
> > >
> > > From: Alexander Duyck <alexander.duyck@gmail.com>
> > > Date: Mon, 15 Apr 2024 11:03:13 -0700
> > >
> > > > On Mon, Apr 15, 2024 at 10:11 AM Jakub Kicinski <kuba@kernel.org> wrote:
> > > >>
> > > >> On Mon, 15 Apr 2024 08:03:38 -0700 Alexander Duyck wrote:
> > > >>>>> The advantage of being a purpose built driver is that we aren't
> > > >>>>> running on any architectures where the PAGE_SIZE > 4K. If it came to
> > > >>>>
> > > >>>> I am not sure if 'being a purpose built driver' argument is strong enough
> > > >>>> here, at least the Kconfig does not seems to be suggesting it is a purpose
> > > >>>> built driver, perhaps add a 'depend on' to suggest that?
> > > >>>
> > > >>> I'm not sure if you have been following the other threads. One of the
> > > >>> general thoughts of pushback against this driver was that Meta is
> > > >>> currently the only company that will have possession of this NIC. As
> > > >>> such Meta will be deciding what systems it goes into and as a result
> > > >>> of that we aren't likely to be running it on systems with 64K pages.
> > > >>
> > > >> Didn't take long for this argument to float to the surface..
> > > >
> > > > This wasn't my full argument. You truncated the part where I
> > > > specifically called out that it is hard to justify us pushing a
> > > > proprietary API that is only used by our driver.
> > > >
> > > >> We tried to write some rules with Paolo but haven't published them, yet.
> > > >> Here is one that may be relevant:
> > > >>
> > > >>   3. External contributions
> > > >>   -------------------------
> > > >>
> > > >>   Owners of drivers for private devices must not exhibit a stronger
> > > >>   sense of ownership or push back on accepting code changes from
> > > >>   members of the community. 3rd party contributions should be evaluated
> > > >>   and eventually accepted, or challenged only on technical arguments
> > > >>   based on the code itself. In particular, the argument that the owner
> > > >>   is the only user and therefore knows best should not be used.
> > > >>
> > > >> Not exactly a contribution, but we predicted the "we know best"
> > > >> tone of the argument :(
> > > >
> > > > The "we know best" is more of an "I know best" as someone who has
> > > > worked with page pool and the page fragment API since well before it
> > > > existed. My push back is based on the fact that we don't want to
> > >
> > > I still strongly believe Jesper-style arguments like "I've been working
> > > with this for aeons", "I invented the Internet", "I was born 3 decades
> > > before this API was introduced" are not valid arguments.
> >
> > Sorry that is a bit of my frustration with Yunsheng coming through. He
> > has another patch set that mostly just moves my code and made himself
> > the maintainer. Admittedly I am a bit annoyed with that. Especially
> > since the main drive seems to be to force everything to use that one
> > approach and then optimize for his use case for vhost net over all
> > others most likely at the expense of everything else.
> >
> > It seems like it is the very thing we were complaining about in patch
> > 0 with other drivers getting penalized at the cost of optimizing for
> > one specific driver.
> >
> > > > allocate fragments, we want to allocate pages and fragment them
> > > > ourselves after the fact. As such it doesn't make much sense to add an
> > > > API that will have us trying to use the page fragment API which holds
> > > > onto the page when the expectation is that we will take the whole
> > > > thing and just fragment it ourselves.
> > >
> > > [...]
> > >
> > > Re "this HW works only on x86, why bother" -- I still believe there
> > > shouldn't be any hardcodes in any driver based on the fact that the HW
> > > is deployed only on particular systems. Page sizes, Endianness,
> > > 32/64-bit... It's not difficult to make a driver look like it's
> > > universal and could work anywhere, really.
> >
> > It isn't that this only works on x86. It is that we can only test it
> > on x86. The biggest issue right now is that I wouldn't have any
> > systems w/ 64K pages that I could test on.
>
> Didn't you write that you will provide QEMU emulation for this device?
>
> Thanks

Yes. I had already mentioned the possibility of testing it this way. I
am just not sure it adds much value to test the already limited
hardware and a limited platform setup in emulation. The issue is that
it will be hard to generate any stress in the QEMU environment since
it maxes out at only about 1 or 2 Gbps as the overhead for providing
TCAMs is software is not insignificant. To top it off, emulating a
non-native architecture will slow things down further. It would be
like asking someone to test a 100G nic on a system that only has PCIe
gen1.

I will probably just go the suggested route of enabling compile
testing on all platforms, and only support loading it on X86_64. As
time permits I can probably assign somebody the job of exploring the
issue of page sizes larger than 4K; however, it will be a matter of
weighing the trade-off of adding technical debt for a use case that
may not be applicable.
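
A rough sketch of how the "compile everywhere, load only on X86_64" plan could be enforced at probe time; the helper name, its placement, and the message text are assumptions, not something from the posted series:

/* Sketch only: refuse to bind on configurations the driver has not
 * been validated on, while still allowing COMPILE_TEST builds.
 */
static int fbnic_check_platform(struct device *dev)
{
	if (!IS_ENABLED(CONFIG_X86_64) || PAGE_SIZE != SZ_4K) {
		dev_err(dev, "only validated on X86_64 with 4K pages\n");
		return -EOPNOTSUPP;
	}

	return 0;
}

Calling something like this early in probe keeps the Kconfig entry buildable everywhere without pretending the 64K-page path has been exercised.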

Patch

diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h
index 0819ddc1dcc8..f61b401fdd5c 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h
@@ -90,6 +90,66 @@  enum {
 
 #define FBNIC_TCD_DONE				DESC_BIT(63)
 
+/* Rx Completion Queue Descriptors */
+#define FBNIC_RCD_TYPE_MASK			DESC_GENMASK(62, 61)
+enum {
+	FBNIC_RCD_TYPE_HDR_AL	= 0,
+	FBNIC_RCD_TYPE_PAY_AL	= 1,
+	FBNIC_RCD_TYPE_OPT_META	= 2,
+	FBNIC_RCD_TYPE_META	= 3,
+};
+
+#define FBNIC_RCD_DONE				DESC_BIT(63)
+
+/* Address/Length Completion Descriptors */
+#define FBNIC_RCD_AL_BUFF_ID_MASK		DESC_GENMASK(15, 0)
+#define FBNIC_RCD_AL_BUFF_LEN_MASK		DESC_GENMASK(28, 16)
+#define FBNIC_RCD_AL_BUFF_OFF_MASK		DESC_GENMASK(43, 32)
+#define FBNIC_RCD_AL_PAGE_FIN			DESC_BIT(60)
+
+/* Header AL specific values */
+#define FBNIC_RCD_HDR_AL_OVERFLOW		DESC_BIT(53)
+#define FBNIC_RCD_HDR_AL_DMA_HINT_MASK		DESC_GENMASK(59, 54)
+enum {
+	FBNIC_RCD_HDR_AL_DMA_HINT_NONE  = 0,
+	FBNIC_RCD_HDR_AL_DMA_HINT_L2	= 1,
+	FBNIC_RCD_HDR_AL_DMA_HINT_L3	= 2,
+	FBNIC_RCD_HDR_AL_DMA_HINT_L4	= 4,
+};
+
+/* Optional Metadata Completion Descriptors */
+#define FBNIC_RCD_OPT_META_TS_MASK		DESC_GENMASK(39, 0)
+#define FBNIC_RCD_OPT_META_ACTION_MASK		DESC_GENMASK(45, 40)
+#define FBNIC_RCD_OPT_META_ACTION		DESC_BIT(57)
+#define FBNIC_RCD_OPT_META_TS			DESC_BIT(58)
+#define FBNIC_RCD_OPT_META_TYPE_MASK		DESC_GENMASK(60, 59)
+
+/* Metadata Completion Descriptors */
+#define FBNIC_RCD_META_RSS_HASH_MASK		DESC_GENMASK(31, 0)
+#define FBNIC_RCD_META_L2_CSUM_MASK		DESC_GENMASK(47, 32)
+#define FBNIC_RCD_META_L3_TYPE_MASK		DESC_GENMASK(49, 48)
+enum {
+	FBNIC_RCD_META_L3_TYPE_OTHER	= 0,
+	FBNIC_RCD_META_L3_TYPE_IPV4	= 1,
+	FBNIC_RCD_META_L3_TYPE_IPV6	= 2,
+	FBNIC_RCD_META_L3_TYPE_V6V6	= 3,
+};
+
+#define FBNIC_RCD_META_L4_TYPE_MASK		DESC_GENMASK(51, 50)
+enum {
+	FBNIC_RCD_META_L4_TYPE_OTHER	= 0,
+	FBNIC_RCD_META_L4_TYPE_TCP	= 1,
+	FBNIC_RCD_META_L4_TYPE_UDP	= 2,
+};
+
+#define FBNIC_RCD_META_L4_CSUM_UNNECESSARY	DESC_BIT(52)
+#define FBNIC_RCD_META_ERR_MAC_EOP		DESC_BIT(53)
+#define FBNIC_RCD_META_ERR_TRUNCATED_FRAME	DESC_BIT(54)
+#define FBNIC_RCD_META_ERR_PARSER		DESC_BIT(55)
+#define FBNIC_RCD_META_UNCORRECTABLE_ERR_MASK	\
+	(FBNIC_RCD_META_ERR_MAC_EOP | FBNIC_RCD_META_ERR_TRUNCATED_FRAME)
+#define FBNIC_RCD_META_ECN			DESC_BIT(60)
+
 /* Rx Buffer Descriptor Format */
 #define FBNIC_BD_PAGE_ADDR_MASK			DESC_GENMASK(45, 12)
 #define FBNIC_BD_PAGE_ID_MASK			DESC_GENMASK(63, 48)
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
index 91d4ea2bfb29..792bdfa7429d 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
@@ -150,8 +150,10 @@  struct net_device *fbnic_netdev_alloc(struct fbnic_dev *fbd)
 	fbnic_reset_queues(fbn, default_queues, default_queues);
 
 	netdev->features |=
+		NETIF_F_RXHASH |
 		NETIF_F_SG |
-		NETIF_F_HW_CSUM;
+		NETIF_F_HW_CSUM |
+		NETIF_F_RXCSUM;
 
 	netdev->hw_features |= netdev->features;
 	netdev->vlan_features |= netdev->features;
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
index f243950c68bb..d897b0d65abf 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
@@ -268,6 +268,9 @@  static void fbnic_service_task(struct work_struct *work)
 
 	fbnic_health_check(fbd);
 
+	if (netif_carrier_ok(fbd->netdev))
+		fbnic_napi_depletion_check(fbd->netdev);
+
 	if (netif_running(fbd->netdev))
 		schedule_delayed_work(&fbd->service_task, HZ);
 
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c
index ad4cb059c959..fe35112b9075 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c
@@ -131,6 +131,24 @@  fbnic_tx_offloads(struct fbnic_ring *ring, struct sk_buff *skb, __le64 *meta)
 	return false;
 }
 
+static void
+fbnic_rx_csum(u64 rcd, struct sk_buff *skb, struct fbnic_ring *rcq)
+{
+	skb_checksum_none_assert(skb);
+
+	if (unlikely(!(skb->dev->features & NETIF_F_RXCSUM)))
+		return;
+
+	if (FIELD_GET(FBNIC_RCD_META_L4_CSUM_UNNECESSARY, rcd)) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	} else {
+		u16 csum = FIELD_GET(FBNIC_RCD_META_L2_CSUM_MASK, rcd);
+
+		skb->ip_summed = CHECKSUM_COMPLETE;
+		skb->csum = (__force __wsum)csum;
+	}
+}
+
 static bool
 fbnic_tx_map(struct fbnic_ring *ring, struct sk_buff *skb, __le64 *meta)
 {
@@ -364,6 +382,16 @@  static void fbnic_page_pool_init(struct fbnic_ring *ring, unsigned int idx,
 	rx_buf->page = page;
 }
 
+static struct page *fbnic_page_pool_get(struct fbnic_ring *ring,
+					unsigned int idx)
+{
+	struct fbnic_rx_buf *rx_buf = &ring->rx_buf[idx];
+
+	rx_buf->pagecnt_bias--;
+
+	return rx_buf->page;
+}
+
 static void fbnic_page_pool_drain(struct fbnic_ring *ring, unsigned int idx,
 				  struct fbnic_napi_vector *nv, int budget)
 {
@@ -501,6 +529,98 @@  static void fbnic_fill_bdq(struct fbnic_napi_vector *nv, struct fbnic_ring *bdq)
 	}
 }
 
+static unsigned int fbnic_hdr_pg_start(unsigned int pg_off)
+{
+	/* The headroom of the first header may be larger than FBNIC_RX_HROOM
+	 * due to alignment. So account for that by just making the page
+	 * offset 0 if we are starting at the first header.
+	 */
+	if (ALIGN(FBNIC_RX_HROOM, 128) > FBNIC_RX_HROOM &&
+	    pg_off == ALIGN(FBNIC_RX_HROOM, 128))
+		return 0;
+
+	return pg_off - FBNIC_RX_HROOM;
+}
+
+static unsigned int fbnic_hdr_pg_end(unsigned int pg_off, unsigned int len)
+{
+	/* Determine the end of the buffer by finding the start of the next
+	 * and then subtracting the headroom from that frame.
+	 */
+	pg_off += len + FBNIC_RX_TROOM + FBNIC_RX_HROOM;
+
+	return ALIGN(pg_off, 128) - FBNIC_RX_HROOM;
+}
+
+static void fbnic_pkt_prepare(struct fbnic_napi_vector *nv, u64 rcd,
+			      struct fbnic_pkt_buff *pkt,
+			      struct fbnic_q_triad *qt)
+{
+	unsigned int hdr_pg_off = FIELD_GET(FBNIC_RCD_AL_BUFF_OFF_MASK, rcd);
+	unsigned int hdr_pg_idx = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd);
+	struct page *page = fbnic_page_pool_get(&qt->sub0, hdr_pg_idx);
+	unsigned int len = FIELD_GET(FBNIC_RCD_AL_BUFF_LEN_MASK, rcd);
+	unsigned int frame_sz, hdr_pg_start, hdr_pg_end, headroom;
+	unsigned char *hdr_start;
+
+	/* data_hard_start should always be NULL when this is called */
+	WARN_ON_ONCE(pkt->buff.data_hard_start);
+
+	/* Short-cut the end calculation if we know page is fully consumed */
+	hdr_pg_end = FIELD_GET(FBNIC_RCD_AL_PAGE_FIN, rcd) ?
+		     PAGE_SIZE : fbnic_hdr_pg_end(hdr_pg_off, len);
+	hdr_pg_start = fbnic_hdr_pg_start(hdr_pg_off);
+
+	frame_sz = hdr_pg_end - hdr_pg_start;
+	xdp_init_buff(&pkt->buff, frame_sz, NULL);
+
+	/* Sync DMA buffer */
+	dma_sync_single_range_for_cpu(nv->dev, page_pool_get_dma_addr(page),
+				      hdr_pg_start, frame_sz,
+				      DMA_BIDIRECTIONAL);
+
+	/* Build frame around buffer */
+	hdr_start = page_address(page) + hdr_pg_start;
+	headroom = hdr_pg_off - hdr_pg_start + FBNIC_RX_PAD;
+
+	xdp_prepare_buff(&pkt->buff, hdr_start, headroom,
+			 len - FBNIC_RX_PAD, true);
+
+	pkt->data_truesize = 0;
+	pkt->data_len = 0;
+	pkt->nr_frags = 0;
+}
+
+static void fbnic_add_rx_frag(struct fbnic_napi_vector *nv, u64 rcd,
+			      struct fbnic_pkt_buff *pkt,
+			      struct fbnic_q_triad *qt)
+{
+	unsigned int pg_off = FIELD_GET(FBNIC_RCD_AL_BUFF_OFF_MASK, rcd);
+	unsigned int pg_idx = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd);
+	unsigned int len = FIELD_GET(FBNIC_RCD_AL_BUFF_LEN_MASK, rcd);
+	struct page *page = fbnic_page_pool_get(&qt->sub1, pg_idx);
+	struct skb_shared_info *shinfo;
+	unsigned int truesize;
+
+	truesize = FIELD_GET(FBNIC_RCD_AL_PAGE_FIN, rcd) ? PAGE_SIZE - pg_off :
+		   ALIGN(len, 128);
+
+	/* Sync DMA buffer */
+	dma_sync_single_range_for_cpu(nv->dev, page_pool_get_dma_addr(page),
+				      pg_off, truesize, DMA_BIDIRECTIONAL);
+
+	/* Add page to xdp shared info */
+	shinfo = xdp_get_shared_info_from_buff(&pkt->buff);
+
+	/* Add the fragment's truesize to the running packet truesize */
+	pkt->data_truesize += truesize;
+
+	__skb_fill_page_desc_noacc(shinfo, pkt->nr_frags++, page, pg_off, len);
+
+	/* Add the fragment's length to the running packet data_len */
+	pkt->data_len += len;
+}
+
 static void fbnic_put_pkt_buff(struct fbnic_napi_vector *nv,
 			       struct fbnic_pkt_buff *pkt, int budget)
 {
@@ -522,7 +642,167 @@  static void fbnic_put_pkt_buff(struct fbnic_napi_vector *nv,
 	page = virt_to_page(pkt->buff.data_hard_start);
 	page_pool_put_full_page(nv->page_pool, page, !!budget);
 	pkt->buff.data_hard_start = NULL;
-};
+}
+
+static struct sk_buff *fbnic_build_skb(struct fbnic_napi_vector *nv,
+				       struct fbnic_pkt_buff *pkt)
+{
+	unsigned int nr_frags = pkt->nr_frags;
+	struct skb_shared_info *shinfo;
+	unsigned int truesize;
+	struct sk_buff *skb;
+
+	truesize = xdp_data_hard_end(&pkt->buff) + FBNIC_RX_TROOM -
+		   pkt->buff.data_hard_start;
+
+	/* Build frame around buffer */
+	skb = napi_build_skb(pkt->buff.data_hard_start, truesize);
+	if (unlikely(!skb))
+		return NULL;
+
+	/* Push data pointer to start of data, put tail to end of data */
+	skb_reserve(skb, pkt->buff.data - pkt->buff.data_hard_start);
+	__skb_put(skb, pkt->buff.data_end - pkt->buff.data);
+
+	/* Add tracking for metadata at the start of the frame */
+	skb_metadata_set(skb, pkt->buff.data - pkt->buff.data_meta);
+
+	/* Add Rx frags */
+	if (nr_frags) {
+		/* Verify that shared info didn't move */
+		shinfo = xdp_get_shared_info_from_buff(&pkt->buff);
+		WARN_ON(skb_shinfo(skb) != shinfo);
+
+		skb->truesize += pkt->data_truesize;
+		skb->data_len += pkt->data_len;
+		shinfo->nr_frags = nr_frags;
+		skb->len += pkt->data_len;
+	}
+
+	skb_mark_for_recycle(skb);
+
+	/* Set MAC header specific fields */
+	skb->protocol = eth_type_trans(skb, nv->napi.dev);
+
+	return skb;
+}
+
+static enum pkt_hash_types fbnic_skb_hash_type(u64 rcd)
+{
+	return (FBNIC_RCD_META_L4_TYPE_MASK & rcd) ? PKT_HASH_TYPE_L4 :
+	       (FBNIC_RCD_META_L3_TYPE_MASK & rcd) ? PKT_HASH_TYPE_L3 :
+						     PKT_HASH_TYPE_L2;
+}
+
+static void fbnic_populate_skb_fields(struct fbnic_napi_vector *nv,
+				      u64 rcd, struct sk_buff *skb,
+				      struct fbnic_q_triad *qt)
+{
+	struct net_device *netdev = nv->napi.dev;
+	struct fbnic_ring *rcq = &qt->cmpl;
+
+	fbnic_rx_csum(rcd, skb, rcq);
+
+	if (netdev->features & NETIF_F_RXHASH)
+		skb_set_hash(skb,
+			     FIELD_GET(FBNIC_RCD_META_RSS_HASH_MASK, rcd),
+			     fbnic_skb_hash_type(rcd));
+
+	skb_record_rx_queue(skb, rcq->q_idx);
+}
+
+static bool fbnic_rcd_metadata_err(u64 rcd)
+{
+	return !!(FBNIC_RCD_META_UNCORRECTABLE_ERR_MASK & rcd);
+}
+
+static int fbnic_clean_rcq(struct fbnic_napi_vector *nv,
+			   struct fbnic_q_triad *qt, int budget)
+{
+	struct fbnic_ring *rcq = &qt->cmpl;
+	struct fbnic_pkt_buff *pkt;
+	s32 head0 = -1, head1 = -1;
+	__le64 *raw_rcd, done;
+	u32 head = rcq->head;
+	u64 packets = 0;
+
+	done = (head & (rcq->size_mask + 1)) ? cpu_to_le64(FBNIC_RCD_DONE) : 0;
+	raw_rcd = &rcq->desc[head & rcq->size_mask];
+	pkt = rcq->pkt;
+
+	/* Walk the completion queue collecting the heads reported by NIC */
+	while (likely(packets < budget)) {
+		struct sk_buff *skb = ERR_PTR(-EINVAL);
+		u64 rcd;
+
+		if ((*raw_rcd & cpu_to_le64(FBNIC_RCD_DONE)) == done)
+			break;
+
+		dma_rmb();
+
+		rcd = le64_to_cpu(*raw_rcd);
+
+		switch (FIELD_GET(FBNIC_RCD_TYPE_MASK, rcd)) {
+		case FBNIC_RCD_TYPE_HDR_AL:
+			head0 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd);
+			fbnic_pkt_prepare(nv, rcd, pkt, qt);
+
+			break;
+		case FBNIC_RCD_TYPE_PAY_AL:
+			head1 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd);
+			fbnic_add_rx_frag(nv, rcd, pkt, qt);
+
+			break;
+		case FBNIC_RCD_TYPE_OPT_META:
+			/* Only type 0 is currently supported */
+			if (FIELD_GET(FBNIC_RCD_OPT_META_TYPE_MASK, rcd))
+				break;
+
+			/* We currently ignore the action table index */
+			break;
+		case FBNIC_RCD_TYPE_META:
+			if (likely(!fbnic_rcd_metadata_err(rcd)))
+				skb = fbnic_build_skb(nv, pkt);
+
+			/* populate skb and invalidate XDP */
+			if (!IS_ERR_OR_NULL(skb)) {
+				fbnic_populate_skb_fields(nv, rcd, skb, qt);
+
+				packets++;
+
+				napi_gro_receive(&nv->napi, skb);
+			}
+
+			pkt->buff.data_hard_start = NULL;
+
+			break;
+		}
+
+		raw_rcd++;
+		head++;
+		if (!(head & rcq->size_mask)) {
+			done ^= cpu_to_le64(FBNIC_RCD_DONE);
+			raw_rcd = &rcq->desc[0];
+		}
+	}
+
+	/* Unmap and free processed buffers */
+	if (head0 >= 0)
+		fbnic_clean_bdq(nv, budget, &qt->sub0, head0);
+	fbnic_fill_bdq(nv, &qt->sub0);
+
+	if (head1 >= 0)
+		fbnic_clean_bdq(nv, budget, &qt->sub1, head1);
+	fbnic_fill_bdq(nv, &qt->sub1);
+
+	/* Record the current head/tail of the queue */
+	if (rcq->head != head) {
+		rcq->head = head;
+		writel(head & rcq->size_mask, rcq->doorbell);
+	}
+
+	return packets;
+}
 
 static void fbnic_nv_irq_disable(struct fbnic_napi_vector *nv)
 {
@@ -545,12 +825,19 @@  static int fbnic_poll(struct napi_struct *napi, int budget)
 	struct fbnic_napi_vector *nv = container_of(napi,
 						    struct fbnic_napi_vector,
 						    napi);
-	int i;
+	int i, j, work_done = 0;
 
 	for (i = 0; i < nv->txt_count; i++)
 		fbnic_clean_tcq(nv, &nv->qt[i], budget);
 
-	if (likely(napi_complete_done(napi, 0)))
+	if (budget)
+		for (j = 0; j < nv->rxt_count; j++, i++)
+			work_done += fbnic_clean_rcq(nv, &nv->qt[i], budget);
+
+	if (work_done >= budget)
+		return budget;
+
+	if (likely(napi_complete_done(napi, work_done)))
 		fbnic_nv_irq_rearm(nv);
 
 	return 0;
@@ -1555,3 +1842,32 @@  void fbnic_napi_enable(struct fbnic_net *fbn)
 	}
 	wrfl();
 }
+
+void fbnic_napi_depletion_check(struct net_device *netdev)
+{
+	struct fbnic_net *fbn = netdev_priv(netdev);
+	u32 irqs[FBNIC_MAX_MSIX_VECS / 32] = {};
+	struct fbnic_dev *fbd = fbn->fbd;
+	struct fbnic_napi_vector *nv;
+	int i, j;
+
+	list_for_each_entry(nv, &fbn->napis, napis) {
+		/* Find RQs which are completely out of pages */
+		for (i = nv->txt_count, j = 0; j < nv->rxt_count; j++, i++) {
+			/* Assume 4 pages is always enough to fit a packet
+			 * and therefore generate a completion and an IRQ.
+			 */
+			if (fbnic_desc_used(&nv->qt[i].sub0) < 4 ||
+			    fbnic_desc_used(&nv->qt[i].sub1) < 4)
+				irqs[nv->v_idx / 32] |= BIT(nv->v_idx % 32);
+		}
+	}
+
+	for (i = 0; i < ARRAY_SIZE(irqs); i++) {
+		if (!irqs[i])
+			continue;
+		wr32(FBNIC_INTR_MASK_CLEAR(i), irqs[i]);
+		wr32(FBNIC_INTR_SET(i), irqs[i]);
+	}
+	wrfl();
+}
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h
index 0c424c49866d..4e43d41c781a 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h
@@ -5,6 +5,7 @@ 
 #define _FBNIC_TXRX_H_
 
 #include <linux/netdevice.h>
+#include <linux/skbuff.h>
 #include <linux/types.h>
 #include <net/xdp.h>
 
@@ -118,6 +119,7 @@  void fbnic_config_drop_mode(struct fbnic_net *fbn);
 void fbnic_flush(struct fbnic_net *fbn);
 void fbnic_fill(struct fbnic_net *fbn);
 
+void fbnic_napi_depletion_check(struct net_device *netdev);
 int fbnic_wait_all_queues_idle(struct fbnic_dev *fbd, bool may_fail);
 
 #endif /* _FBNIC_TXRX_H_ */