Message ID | 171217496013.1598374.10126180029382922588.stgit@ahduyck-xeon-server.home.arpa (mailing list archive)
---|---
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Series | eth: fbnic: Add network driver for Meta Platforms Host Network Interface
On 2024/4/4 4:09, Alexander Duyck wrote: > From: Alexander Duyck <alexanderduyck@fb.com> ... > + > +static int fbnic_clean_rcq(struct fbnic_napi_vector *nv, > + struct fbnic_q_triad *qt, int budget) > +{ > + struct fbnic_ring *rcq = &qt->cmpl; > + struct fbnic_pkt_buff *pkt; > + s32 head0 = -1, head1 = -1; > + __le64 *raw_rcd, done; > + u32 head = rcq->head; > + u64 packets = 0; > + > + done = (head & (rcq->size_mask + 1)) ? cpu_to_le64(FBNIC_RCD_DONE) : 0; > + raw_rcd = &rcq->desc[head & rcq->size_mask]; > + pkt = rcq->pkt; > + > + /* Walk the completion queue collecting the heads reported by NIC */ > + while (likely(packets < budget)) { > + struct sk_buff *skb = ERR_PTR(-EINVAL); > + u64 rcd; > + > + if ((*raw_rcd & cpu_to_le64(FBNIC_RCD_DONE)) == done) > + break; > + > + dma_rmb(); > + > + rcd = le64_to_cpu(*raw_rcd); > + > + switch (FIELD_GET(FBNIC_RCD_TYPE_MASK, rcd)) { > + case FBNIC_RCD_TYPE_HDR_AL: > + head0 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd); > + fbnic_pkt_prepare(nv, rcd, pkt, qt); > + > + break; > + case FBNIC_RCD_TYPE_PAY_AL: > + head1 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd); > + fbnic_add_rx_frag(nv, rcd, pkt, qt); > + > + break; > + case FBNIC_RCD_TYPE_OPT_META: > + /* Only type 0 is currently supported */ > + if (FIELD_GET(FBNIC_RCD_OPT_META_TYPE_MASK, rcd)) > + break; > + > + /* We currently ignore the action table index */ > + break; > + case FBNIC_RCD_TYPE_META: > + if (likely(!fbnic_rcd_metadata_err(rcd))) > + skb = fbnic_build_skb(nv, pkt); > + > + /* populate skb and invalidate XDP */ > + if (!IS_ERR_OR_NULL(skb)) { > + fbnic_populate_skb_fields(nv, rcd, skb, qt); > + > + packets++; > + > + napi_gro_receive(&nv->napi, skb); > + } > + > + pkt->buff.data_hard_start = NULL; > + > + break; > + } > + > + raw_rcd++; > + head++; > + if (!(head & rcq->size_mask)) { > + done ^= cpu_to_le64(FBNIC_RCD_DONE); > + raw_rcd = &rcq->desc[0]; > + } > + } > + > + /* Unmap and free processed buffers */ > + if (head0 >= 0) > + fbnic_clean_bdq(nv, budget, &qt->sub0, head0); > + fbnic_fill_bdq(nv, &qt->sub0); > + > + if (head1 >= 0) > + fbnic_clean_bdq(nv, budget, &qt->sub1, head1); > + fbnic_fill_bdq(nv, &qt->sub1); I am not sure how complicated the rx handling will be for the advanced feature. For the current code, for each entry/desc in both qt->sub0 and qt->sub1 at least need one page, and the page seems to be only used once no matter however small the page is used? I am assuming you want to do 'tightly optimized' operation for this by calling page_pool_fragment_page(), but manipulating page->pp_ref_count directly does not seems to add any value for the current code, but seem to waste a lot of memory by not using the frag API, especially PAGE_SIZE > 4K? > + > + /* Record the current head/tail of the queue */ > + if (rcq->head != head) { > + rcq->head = head; > + writel(head & rcq->size_mask, rcq->doorbell); > + } > + > + return packets; > +} > >
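For reference, the "frag API" being suggested here is the existing page_pool_dev_alloc_frag() path, where page_pool itself splits the page into sub-page slices and manages the per-page reference count, rather than the driver calling page_pool_fragment_page() directly. A minimal sketch of that usage (the rx_buf structure and helper name are illustrative only, not fbnic code):

#include <linux/sizes.h>
#include <net/page_pool/helpers.h>

struct rx_buf {			/* illustrative, not the fbnic structure */
	struct page *page;
	unsigned int offset;
};

/* Refill one descriptor with a 2K slice of a page; page_pool itself
 * tracks how many slices of the underlying page are still outstanding.
 */
static int refill_with_frag(struct page_pool *pool, struct rx_buf *buf)
{
	unsigned int offset;
	struct page *page;

	page = page_pool_dev_alloc_frag(pool, &offset, SZ_2K);
	if (!page)
		return -ENOMEM;

	buf->page = page;
	buf->offset = offset;
	return 0;
}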
On Tue, Apr 9, 2024 at 4:47 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > On 2024/4/4 4:09, Alexander Duyck wrote: > > From: Alexander Duyck <alexanderduyck@fb.com> [...] > > + /* Unmap and free processed buffers */ > > + if (head0 >= 0) > > + fbnic_clean_bdq(nv, budget, &qt->sub0, head0); > > + fbnic_fill_bdq(nv, &qt->sub0); > > + > > + if (head1 >= 0) > > + fbnic_clean_bdq(nv, budget, &qt->sub1, head1); > > + fbnic_fill_bdq(nv, &qt->sub1); > > I am not sure how complicated the rx handling will be for the advanced > feature. For the current code, for each entry/desc in both qt->sub0 and > qt->sub1 at least need one page, and the page seems to be only used once > no matter however small the page is used? > > I am assuming you want to do 'tightly optimized' operation for this by > calling page_pool_fragment_page(), but manipulating page->pp_ref_count > directly does not seems to add any value for the current code, but seem > to waste a lot of memory by not using the frag API, especially PAGE_SIZE > > 4K? On this hardware both the header and payload buffers are fragmentable. The hardware decides the partitioning and we just follow it. So for example it wouldn't be uncommon to have a jumbo frame split up such that the header is less than 128B plus SKB overhead while the actual data in the payload is just over 1400. So for us fragmenting the pages is a very likely case especially with smaller packets. It is better for us to optimize for the small packet scenario than optimize for the case where 4K slices are getting taken. That way when we are CPU constrained handling small packets we are the most optimized whereas for the larger frames we can spare a few cycles to account for the extra overhead. The result should be a higher overall packets per second.
On 2024/4/9 23:08, Alexander Duyck wrote: > On Tue, Apr 9, 2024 at 4:47 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: >> >> On 2024/4/4 4:09, Alexander Duyck wrote: >>> From: Alexander Duyck <alexanderduyck@fb.com> > > [...] > >>> + /* Unmap and free processed buffers */ >>> + if (head0 >= 0) >>> + fbnic_clean_bdq(nv, budget, &qt->sub0, head0); >>> + fbnic_fill_bdq(nv, &qt->sub0); >>> + >>> + if (head1 >= 0) >>> + fbnic_clean_bdq(nv, budget, &qt->sub1, head1); >>> + fbnic_fill_bdq(nv, &qt->sub1); >> >> I am not sure how complicated the rx handling will be for the advanced >> feature. For the current code, for each entry/desc in both qt->sub0 and >> qt->sub1 at least need one page, and the page seems to be only used once >> no matter however small the page is used? >> >> I am assuming you want to do 'tightly optimized' operation for this by >> calling page_pool_fragment_page(), but manipulating page->pp_ref_count >> directly does not seems to add any value for the current code, but seem >> to waste a lot of memory by not using the frag API, especially PAGE_SIZE >>> 4K? > > On this hardware both the header and payload buffers are fragmentable. > The hardware decides the partitioning and we just follow it. So for > example it wouldn't be uncommon to have a jumbo frame split up such > that the header is less than 128B plus SKB overhead while the actual > data in the payload is just over 1400. So for us fragmenting the pages > is a very likely case especially with smaller packets. I understand that is what you are trying to do, but the code above does not seems to match the description, as the fbnic_clean_bdq() and fbnic_fill_bdq() are called for qt->sub0 and qt->sub1, so the old pages of qt->sub0 and qt->sub1 just cleaned are drained and refill each sub with new pages, which does not seems to have any fragmenting? The fragmenting can only happen when there is continuous small packet coming from wire so that hw can report the same pg_id for different packet with pg_offset before fbnic_clean_bdq() and fbnic_fill_bdq() is called? I am not sure how to ensure that considering that we might break out of while loop in fbnic_clean_rcq() because of 'packets < budget' checking. > > It is better for us to optimize for the small packet scenario than > optimize for the case where 4K slices are getting taken. That way when > we are CPU constrained handling small packets we are the most > optimized whereas for the larger frames we can spare a few cycles to > account for the extra overhead. The result should be a higher overall > packets per second. The problem is that small packet means low utilization of the bandwidth as more bandwidth is used to send header instead of payload that is useful for the user, so the question seems to be how often the small packet is seen in the wire? > . >
On Wed, Apr 10, 2024 at 4:54 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > On 2024/4/9 23:08, Alexander Duyck wrote: > > On Tue, Apr 9, 2024 at 4:47 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > >> > >> On 2024/4/4 4:09, Alexander Duyck wrote: > >>> From: Alexander Duyck <alexanderduyck@fb.com> > > > > [...] > > > >>> + /* Unmap and free processed buffers */ > >>> + if (head0 >= 0) > >>> + fbnic_clean_bdq(nv, budget, &qt->sub0, head0); > >>> + fbnic_fill_bdq(nv, &qt->sub0); > >>> + > >>> + if (head1 >= 0) > >>> + fbnic_clean_bdq(nv, budget, &qt->sub1, head1); > >>> + fbnic_fill_bdq(nv, &qt->sub1); > >> > >> I am not sure how complicated the rx handling will be for the advanced > >> feature. For the current code, for each entry/desc in both qt->sub0 and > >> qt->sub1 at least need one page, and the page seems to be only used once > >> no matter however small the page is used? > >> > >> I am assuming you want to do 'tightly optimized' operation for this by > >> calling page_pool_fragment_page(), but manipulating page->pp_ref_count > >> directly does not seems to add any value for the current code, but seem > >> to waste a lot of memory by not using the frag API, especially PAGE_SIZE > >>> 4K? > > > > On this hardware both the header and payload buffers are fragmentable. > > The hardware decides the partitioning and we just follow it. So for > > example it wouldn't be uncommon to have a jumbo frame split up such > > that the header is less than 128B plus SKB overhead while the actual > > data in the payload is just over 1400. So for us fragmenting the pages > > is a very likely case especially with smaller packets. > > I understand that is what you are trying to do, but the code above does > not seems to match the description, as the fbnic_clean_bdq() and > fbnic_fill_bdq() are called for qt->sub0 and qt->sub1, so the old pages > of qt->sub0 and qt->sub1 just cleaned are drained and refill each sub > with new pages, which does not seems to have any fragmenting? That is because it is all taken care of by the completion queue. Take a look in fbnic_pkt_prepare. We are taking the buffer from the header descriptor and taking a slice out of it there via fbnic_page_pool_get. Basically we store the fragment count locally in the rx_buf and then subtract what is leftover when the device is done with it. > The fragmenting can only happen when there is continuous small packet > coming from wire so that hw can report the same pg_id for different > packet with pg_offset before fbnic_clean_bdq() and fbnic_fill_bdq() > is called? I am not sure how to ensure that considering that we might > break out of while loop in fbnic_clean_rcq() because of 'packets < budget' > checking. We don't free the page until we have moved one past it, or the hardware has indicated it will take no more slices via a PAGE_FIN bit in the descriptor. > > It is better for us to optimize for the small packet scenario than > > optimize for the case where 4K slices are getting taken. That way when > > we are CPU constrained handling small packets we are the most > > optimized whereas for the larger frames we can spare a few cycles to > > account for the extra overhead. The result should be a higher overall > > packets per second. > > The problem is that small packet means low utilization of the bandwidth > as more bandwidth is used to send header instead of payload that is useful > for the user, so the question seems to be how often the small packet is > seen in the wire? Very often. 
Especially when you are running something like servers where the flow usually consists of an incoming request which is often only a few hundred bytes, followed by us sending a response which then leads to a flow of control frames for it.
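Roughly, the scheme Duyck describes above is: seed a large fragment count on the page at refill time, consume it locally as the hardware reports slices, and return whatever is left when the buffer is retired. A sketch of that bookkeeping, assuming the page_pool_fragment_page()/page_pool_unref_page()/page_pool_put_unrefed_page() helpers from recent kernels (names, the constant, and the structure are illustrative, not the actual fbnic code):

#include <net/page_pool/helpers.h>

/* Arbitrarily large; only needs to exceed the slices one page can yield. */
#define RX_BUF_BIAS_MAX	65536L

struct rx_buf {			/* illustrative stand-in for fbnic_rx_buf */
	struct page *page;
	long pagecnt_bias;
};

/* Refill: take ownership of all "slices" of the page up front. */
static void rx_buf_init(struct rx_buf *buf, struct page *page)
{
	page_pool_fragment_page(page, RX_BUF_BIAS_MAX);
	buf->page = page;
	buf->pagecnt_bias = RX_BUF_BIAS_MAX;
}

/* Completion handling: each slice the HW reports consumes one reference. */
static struct page *rx_buf_get_slice(struct rx_buf *buf)
{
	buf->pagecnt_bias--;
	return buf->page;
}

/* Drain: hand back the unused references; recycle if we held the last one. */
static void rx_buf_drain(struct page_pool *pool, struct rx_buf *buf)
{
	if (!page_pool_unref_page(buf->page, buf->pagecnt_bias))
		page_pool_put_unrefed_page(pool, buf->page, -1, true);

	buf->page = NULL;
}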
On 2024/4/10 23:03, Alexander Duyck wrote: > On Wed, Apr 10, 2024 at 4:54 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: >> >> On 2024/4/9 23:08, Alexander Duyck wrote: >>> On Tue, Apr 9, 2024 at 4:47 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: >>>> >>>> On 2024/4/4 4:09, Alexander Duyck wrote: >>>>> From: Alexander Duyck <alexanderduyck@fb.com> >>> >>> [...] >>> >>>>> + /* Unmap and free processed buffers */ >>>>> + if (head0 >= 0) >>>>> + fbnic_clean_bdq(nv, budget, &qt->sub0, head0); >>>>> + fbnic_fill_bdq(nv, &qt->sub0); >>>>> + >>>>> + if (head1 >= 0) >>>>> + fbnic_clean_bdq(nv, budget, &qt->sub1, head1); >>>>> + fbnic_fill_bdq(nv, &qt->sub1); >>>> >>>> I am not sure how complicated the rx handling will be for the advanced >>>> feature. For the current code, for each entry/desc in both qt->sub0 and >>>> qt->sub1 at least need one page, and the page seems to be only used once >>>> no matter however small the page is used? >>>> >>>> I am assuming you want to do 'tightly optimized' operation for this by >>>> calling page_pool_fragment_page(), but manipulating page->pp_ref_count >>>> directly does not seems to add any value for the current code, but seem >>>> to waste a lot of memory by not using the frag API, especially PAGE_SIZE >>>>> 4K? >>> >>> On this hardware both the header and payload buffers are fragmentable. >>> The hardware decides the partitioning and we just follow it. So for >>> example it wouldn't be uncommon to have a jumbo frame split up such >>> that the header is less than 128B plus SKB overhead while the actual >>> data in the payload is just over 1400. So for us fragmenting the pages >>> is a very likely case especially with smaller packets. >> >> I understand that is what you are trying to do, but the code above does >> not seems to match the description, as the fbnic_clean_bdq() and >> fbnic_fill_bdq() are called for qt->sub0 and qt->sub1, so the old pages >> of qt->sub0 and qt->sub1 just cleaned are drained and refill each sub >> with new pages, which does not seems to have any fragmenting? > > That is because it is all taken care of by the completion queue. Take > a look in fbnic_pkt_prepare. We are taking the buffer from the header > descriptor and taking a slice out of it there via fbnic_page_pool_get. > Basically we store the fragment count locally in the rx_buf and then > subtract what is leftover when the device is done with it. The above seems look a lot like the prepare/commit API in [1], the prepare is done in fbnic_fill_bdq() and commit is done by fbnic_page_pool_get() in fbnic_pkt_prepare() and fbnic_add_rx_frag(). If page_pool is able to provide a central place for pagecnt_bias of all the fragmemts of the same page, we may provide a similar prepare/commit API for frag API, I am not sure how to handle it for now. From the below macro, this hw seems to be only able to handle 4K memory for each entry/desc in qt->sub0 and qt->sub1, so there seems to be a lot of memory that is unused for PAGE_SIZE > 4K as it is allocating memory based on page granularity for each rx_buf in qt->sub0 and qt->sub1. +#define FBNIC_RCD_AL_BUFF_OFF_MASK DESC_GENMASK(43, 32) It is still possible to reserve enough pagecnt_bias for each fragment, so that the caller can still do its own fragmenting on fragment granularity as we seems to have enough pagecnt_bias for each page. If we provide a proper frag API to reserve enough pagecnt_bias for caller to do its own fragmenting, then the memory waste may be avoided for this hw in system with PAGE_SIZE > 4K. 1. 
https://lore.kernel.org/lkml/20240407130850.19625-10-linyunsheng@huawei.com/ > >> The fragmenting can only happen when there is continuous small packet >> coming from wire so that hw can report the same pg_id for different >> packet with pg_offset before fbnic_clean_bdq() and fbnic_fill_bdq() >> is called? I am not sure how to ensure that considering that we might >> break out of while loop in fbnic_clean_rcq() because of 'packets < budget' >> checking. > > We don't free the page until we have moved one past it, or the > hardware has indicated it will take no more slices via a PAGE_FIN bit > in the descriptor. I look more closely at it, I am not able to figure it out how it is done yet, as the PAGE_FIN bit mentioned above seems to be only used to calculate the hdr_pg_end and truesize in fbnic_pkt_prepare() and fbnic_add_rx_frag(). For the below flow in fbnic_clean_rcq(), fbnic_clean_bdq() will be called to drain the page in rx_buf just cleaned when head0/head1 >= 0, so I am not sure how it do the fragmenting yet, am I missing something obvious here? while (likely(packets < budget)) { switch (FIELD_GET(FBNIC_RCD_TYPE_MASK, rcd)) { case FBNIC_RCD_TYPE_HDR_AL: head0 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd); fbnic_pkt_prepare(nv, rcd, pkt, qt); break; case FBNIC_RCD_TYPE_PAY_AL: head1 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd); fbnic_add_rx_frag(nv, rcd, pkt, qt); break; case FBNIC_RCD_TYPE_META: if (likely(!fbnic_rcd_metadata_err(rcd))) skb = fbnic_build_skb(nv, pkt); /* populate skb and invalidate XDP */ if (!IS_ERR_OR_NULL(skb)) { fbnic_populate_skb_fields(nv, rcd, skb, qt); packets++; napi_gro_receive(&nv->napi, skb); } pkt->buff.data_hard_start = NULL; break; } /* Unmap and free processed buffers */ if (head0 >= 0) fbnic_clean_bdq(nv, budget, &qt->sub0, head0); fbnic_fill_bdq(nv, &qt->sub0); if (head1 >= 0) fbnic_clean_bdq(nv, budget, &qt->sub1, head1); fbnic_fill_bdq(nv, &qt->sub1); } > >>> It is better for us to optimize for the small packet scenario than >>> optimize for the case where 4K slices are getting taken. That way when >>> we are CPU constrained handling small packets we are the most >>> optimized whereas for the larger frames we can spare a few cycles to >>> account for the extra overhead. The result should be a higher overall >>> packets per second. >> >> The problem is that small packet means low utilization of the bandwidth >> as more bandwidth is used to send header instead of payload that is useful >> for the user, so the question seems to be how often the small packet is >> seen in the wire? > > Very often. Especially when you are running something like servers > where the flow usually consists of an incoming request which is often > only a few hundred bytes, followed by us sending a response which then > leads to a flow of control frames for it. I think this is depending on the use case, if it is video streaming server, I guess most of the packet is mtu-sized? > . >
On 2024/4/12 16:43, Yunsheng Lin wrote: > On 2024/4/10 23:03, Alexander Duyck wrote: > > If we provide a proper frag API to reserve enough pagecnt_bias for caller to > do its own fragmenting, then the memory waste may be avoided for this hw in > system with PAGE_SIZE > 4K. > Something like the page_pool_alloc_frag_bias() API below: diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 1d397c1a0043..018943307e68 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -92,6 +92,14 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) return page_pool_alloc_pages(pool, gfp); } +static inline struct page *page_pool_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size, + gfp_t gfp) +{ + return page_pool_alloc_frag_bias(pool, offset, size, 1U, gfp); +} + /** * page_pool_dev_alloc_frag() - allocate a page fragment. * @pool: pool from which to allocate diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 5e43a08d3231..2847b96264e5 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -202,8 +202,9 @@ struct page_pool { }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); -struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, - unsigned int size, gfp_t gfp); +struct page *page_pool_alloc_frag_bias(struct page_pool *pool, + unsigned int *offset, unsigned int size, + unsigned int bias, gfp_t gfp); struct page_pool *page_pool_create(const struct page_pool_params *params); struct page_pool *page_pool_create_percpu(const struct page_pool_params *params, int cpuid); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 4c175091fc0a..441b54473c35 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -850,9 +850,11 @@ static void page_pool_free_frag(struct page_pool *pool) page_pool_return_page(pool, page); } -struct page *page_pool_alloc_frag(struct page_pool *pool, - unsigned int *offset, - unsigned int size, gfp_t gfp) +struct page *page_pool_alloc_frag_bias(struct page_pool *pool, + unsigned int *offset, + unsigned int size, + unsigned int bias, + gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; struct page *page = pool->frag_page; @@ -881,19 +883,19 @@ struct page *page_pool_alloc_frag(struct page_pool *pool, pool->frag_page = page; frag_reset: - pool->frag_users = 1; + pool->frag_users = bias; *offset = 0; pool->frag_offset = size; page_pool_fragment_page(page, BIAS_MAX); return page; } - pool->frag_users++; + pool->frag_users += bias; pool->frag_offset = *offset + size; alloc_stat_inc(pool, fast); return page; } -EXPORT_SYMBOL(page_pool_alloc_frag); +EXPORT_SYMBOL(page_pool_alloc_frag_bias); static void page_pool_empty_ring(struct page_pool *pool) {
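To illustrate how the proposed helper would be used, a hypothetical call site assuming the patch above is applied (nothing here exists in mainline): a driver that wants to carve a buffer into sub-fragments itself could reserve the extra references at allocation time instead of touching page->pp_ref_count directly:

/* Reserve 16 references up front for this one 4K slice so the driver
 * can hand out up to 16 sub-slices of it without manipulating
 * page->pp_ref_count itself.
 */
static struct page *alloc_buf_with_bias(struct page_pool *pool,
					unsigned int *offset)
{
	return page_pool_alloc_frag_bias(pool, offset, SZ_4K, 16, GFP_ATOMIC);
}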
On Fri, Apr 12, 2024 at 1:43 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > On 2024/4/10 23:03, Alexander Duyck wrote: > > On Wed, Apr 10, 2024 at 4:54 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > >> > >> On 2024/4/9 23:08, Alexander Duyck wrote: > >>> On Tue, Apr 9, 2024 at 4:47 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > >>>> > >>>> On 2024/4/4 4:09, Alexander Duyck wrote: > >>>>> From: Alexander Duyck <alexanderduyck@fb.com> > >>> > >>> [...] > >>> > >>>>> + /* Unmap and free processed buffers */ > >>>>> + if (head0 >= 0) > >>>>> + fbnic_clean_bdq(nv, budget, &qt->sub0, head0); > >>>>> + fbnic_fill_bdq(nv, &qt->sub0); > >>>>> + > >>>>> + if (head1 >= 0) > >>>>> + fbnic_clean_bdq(nv, budget, &qt->sub1, head1); > >>>>> + fbnic_fill_bdq(nv, &qt->sub1); > >>>> > >>>> I am not sure how complicated the rx handling will be for the advanced > >>>> feature. For the current code, for each entry/desc in both qt->sub0 and > >>>> qt->sub1 at least need one page, and the page seems to be only used once > >>>> no matter however small the page is used? > >>>> > >>>> I am assuming you want to do 'tightly optimized' operation for this by > >>>> calling page_pool_fragment_page(), but manipulating page->pp_ref_count > >>>> directly does not seems to add any value for the current code, but seem > >>>> to waste a lot of memory by not using the frag API, especially PAGE_SIZE > >>>>> 4K? > >>> > >>> On this hardware both the header and payload buffers are fragmentable. > >>> The hardware decides the partitioning and we just follow it. So for > >>> example it wouldn't be uncommon to have a jumbo frame split up such > >>> that the header is less than 128B plus SKB overhead while the actual > >>> data in the payload is just over 1400. So for us fragmenting the pages > >>> is a very likely case especially with smaller packets. > >> > >> I understand that is what you are trying to do, but the code above does > >> not seems to match the description, as the fbnic_clean_bdq() and > >> fbnic_fill_bdq() are called for qt->sub0 and qt->sub1, so the old pages > >> of qt->sub0 and qt->sub1 just cleaned are drained and refill each sub > >> with new pages, which does not seems to have any fragmenting? > > > > That is because it is all taken care of by the completion queue. Take > > a look in fbnic_pkt_prepare. We are taking the buffer from the header > > descriptor and taking a slice out of it there via fbnic_page_pool_get. > > Basically we store the fragment count locally in the rx_buf and then > > subtract what is leftover when the device is done with it. > > The above seems look a lot like the prepare/commit API in [1], the prepare > is done in fbnic_fill_bdq() and commit is done by fbnic_page_pool_get() in > fbnic_pkt_prepare() and fbnic_add_rx_frag(). > > If page_pool is able to provide a central place for pagecnt_bias of all the > fragmemts of the same page, we may provide a similar prepare/commit API for > frag API, I am not sure how to handle it for now. > > From the below macro, this hw seems to be only able to handle 4K memory for > each entry/desc in qt->sub0 and qt->sub1, so there seems to be a lot of memory > that is unused for PAGE_SIZE > 4K as it is allocating memory based on page > granularity for each rx_buf in qt->sub0 and qt->sub1. > > +#define FBNIC_RCD_AL_BUFF_OFF_MASK DESC_GENMASK(43, 32) The advantage of being a purpose built driver is that we aren't running on any architectures where the PAGE_SIZE > 4K. 
If it came to that we could probably look at splitting the pages within the descriptors by simply having a single page span multiple descriptors. > It is still possible to reserve enough pagecnt_bias for each fragment, so that > the caller can still do its own fragmenting on fragment granularity as we > seems to have enough pagecnt_bias for each page. > > If we provide a proper frag API to reserve enough pagecnt_bias for caller to > do its own fragmenting, then the memory waste may be avoided for this hw in > system with PAGE_SIZE > 4K. > > 1. https://lore.kernel.org/lkml/20240407130850.19625-10-linyunsheng@huawei.com/ That isn't a concern for us as we are only using the device on x86 systems at this time. > > > >> The fragmenting can only happen when there is continuous small packet > >> coming from wire so that hw can report the same pg_id for different > >> packet with pg_offset before fbnic_clean_bdq() and fbnic_fill_bdq() > >> is called? I am not sure how to ensure that considering that we might > >> break out of while loop in fbnic_clean_rcq() because of 'packets < budget' > >> checking. > > > > We don't free the page until we have moved one past it, or the > > hardware has indicated it will take no more slices via a PAGE_FIN bit > > in the descriptor. > > > I look more closely at it, I am not able to figure it out how it is done > yet, as the PAGE_FIN bit mentioned above seems to be only used to calculate > the hdr_pg_end and truesize in fbnic_pkt_prepare() and fbnic_add_rx_frag(). > > For the below flow in fbnic_clean_rcq(), fbnic_clean_bdq() will be called > to drain the page in rx_buf just cleaned when head0/head1 >= 0, so I am not > sure how it do the fragmenting yet, am I missing something obvious here? > > while (likely(packets < budget)) { > switch (FIELD_GET(FBNIC_RCD_TYPE_MASK, rcd)) { > case FBNIC_RCD_TYPE_HDR_AL: > head0 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd); > fbnic_pkt_prepare(nv, rcd, pkt, qt); > > break; > case FBNIC_RCD_TYPE_PAY_AL: > head1 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd); > fbnic_add_rx_frag(nv, rcd, pkt, qt); > > break; > > case FBNIC_RCD_TYPE_META: > if (likely(!fbnic_rcd_metadata_err(rcd))) > skb = fbnic_build_skb(nv, pkt); > > /* populate skb and invalidate XDP */ > if (!IS_ERR_OR_NULL(skb)) { > fbnic_populate_skb_fields(nv, rcd, skb, qt); > > packets++; > > napi_gro_receive(&nv->napi, skb); > } > > pkt->buff.data_hard_start = NULL; > > break; > } > > /* Unmap and free processed buffers */ > if (head0 >= 0) > fbnic_clean_bdq(nv, budget, &qt->sub0, head0); > fbnic_fill_bdq(nv, &qt->sub0); > > if (head1 >= 0) > fbnic_clean_bdq(nv, budget, &qt->sub1, head1); > fbnic_fill_bdq(nv, &qt->sub1); > > } The cleanup logic cleans everything up to but not including the head0/head1 offsets. So the pages are left on the ring until they are fully consumed. > > > >>> It is better for us to optimize for the small packet scenario than > >>> optimize for the case where 4K slices are getting taken. That way when > >>> we are CPU constrained handling small packets we are the most > >>> optimized whereas for the larger frames we can spare a few cycles to > >>> account for the extra overhead. The result should be a higher overall > >>> packets per second. > >> > >> The problem is that small packet means low utilization of the bandwidth > >> as more bandwidth is used to send header instead of payload that is useful > >> for the user, so the question seems to be how often the small packet is > >> seen in the wire? > > > > Very often. 
Especially when you are running something like servers > > where the flow usually consists of an incoming request which is often > > only a few hundred bytes, followed by us sending a response which then > > leads to a flow of control frames for it. > > I think this is depending on the use case, if it is video streaming server, > I guess most of the packet is mtu-sized? For the transmit side, yes. For the server side no. A typical TCP flow has two sides to it. One sending SYN/ACK/FIN requests and the initial get request and the other basically sending the response data.
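In other words, the buffer-descriptor cleanup described above stops one short of the head reported by the completion queue, so the page at the head can keep absorbing further slices. A simplified sketch of that semantic, reusing the illustrative rx_buf_drain() from earlier (ring layout and names are assumptions, not the fbnic code):

struct bdq_ring {		/* illustrative stand-in for fbnic_ring */
	struct rx_buf *rx_buf;
	unsigned int size_mask;
	unsigned int head;
};

/* Drain every buffer the completion queue has moved past, but leave the
 * buffer at 'hw_head' alone -- the hardware may still carve further
 * slices out of that page.
 */
static void clean_bdq_sketch(struct page_pool *pool, struct bdq_ring *ring,
			     unsigned int hw_head)
{
	unsigned int head = ring->head;

	while (head != hw_head) {
		rx_buf_drain(pool, &ring->rx_buf[head & ring->size_mask]);
		head++;
	}

	ring->head = head;
}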
On 2024/4/12 23:05, Alexander Duyck wrote: ... >> >> From the below macro, this hw seems to be only able to handle 4K memory for >> each entry/desc in qt->sub0 and qt->sub1, so there seems to be a lot of memory >> that is unused for PAGE_SIZE > 4K as it is allocating memory based on page >> granularity for each rx_buf in qt->sub0 and qt->sub1. >> >> +#define FBNIC_RCD_AL_BUFF_OFF_MASK DESC_GENMASK(43, 32) > > The advantage of being a purpose built driver is that we aren't > running on any architectures where the PAGE_SIZE > 4K. If it came to I am not sure if 'being a purpose built driver' argument is strong enough here, at least the Kconfig does not seems to be suggesting it is a purpose built driver, perhaps add a 'depend on' to suggest that? > that we could probably look at splitting the pages within the > descriptors by simply having a single page span multiple descriptors. My point is that we might be able to meet the above use case with a proper API without driver manipulating the reference counting by calling page_pool_fragment_page() directly.
On Mon, Apr 15, 2024 at 6:19 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > On 2024/4/12 23:05, Alexander Duyck wrote: > > ... > > >> > >> From the below macro, this hw seems to be only able to handle 4K memory for > >> each entry/desc in qt->sub0 and qt->sub1, so there seems to be a lot of memory > >> that is unused for PAGE_SIZE > 4K as it is allocating memory based on page > >> granularity for each rx_buf in qt->sub0 and qt->sub1. > >> > >> +#define FBNIC_RCD_AL_BUFF_OFF_MASK DESC_GENMASK(43, 32) > > > > The advantage of being a purpose built driver is that we aren't > > running on any architectures where the PAGE_SIZE > 4K. If it came to > > I am not sure if 'being a purpose built driver' argument is strong enough > here, at least the Kconfig does not seems to be suggesting it is a purpose > built driver, perhaps add a 'depend on' to suggest that? I'm not sure if you have been following the other threads. One of the general thoughts of pushback against this driver was that Meta is currently the only company that will have possession of this NIC. As such Meta will be deciding what systems it goes into and as a result of that we aren't likely to be running it on systems with 64K pages. > > that we could probably look at splitting the pages within the > > descriptors by simply having a single page span multiple descriptors. > > My point is that we might be able to meet the above use case with a proper > API without driver manipulating the reference counting by calling > page_pool_fragment_page() directly. My suggestion would be to look at putting your proposed API together as something that can be used by another driver. Once we hit that I can then look at incorporating it into fbnic. One issue right now is that the current patch set is meant to make use of existing APIs instead of needing to rely on creating new ones as this isn't a device others will have access to so it will make it harder to test any proposed API based only on fbnic.
On Mon, 15 Apr 2024 08:03:38 -0700 Alexander Duyck wrote: > > > The advantage of being a purpose built driver is that we aren't > > > running on any architectures where the PAGE_SIZE > 4K. If it came to > > > > I am not sure if 'being a purpose built driver' argument is strong enough > > here, at least the Kconfig does not seems to be suggesting it is a purpose > > built driver, perhaps add a 'depend on' to suggest that? > > I'm not sure if you have been following the other threads. One of the > general thoughts of pushback against this driver was that Meta is > currently the only company that will have possession of this NIC. As > such Meta will be deciding what systems it goes into and as a result > of that we aren't likely to be running it on systems with 64K pages. Didn't take long for this argument to float to the surface.. We tried to write some rules with Paolo but haven't published them, yet. Here is one that may be relevant: 3. External contributions ------------------------- Owners of drivers for private devices must not exhibit a stronger sense of ownership or push back on accepting code changes from members of the community. 3rd party contributions should be evaluated and eventually accepted, or challenged only on technical arguments based on the code itself. In particular, the argument that the owner is the only user and therefore knows best should not be used. Not exactly a contribution, but we predicted the "we know best" tone of the argument :(
On Mon, Apr 15, 2024 at 10:11 AM Jakub Kicinski <kuba@kernel.org> wrote: > > On Mon, 15 Apr 2024 08:03:38 -0700 Alexander Duyck wrote: > > > > The advantage of being a purpose built driver is that we aren't > > > > running on any architectures where the PAGE_SIZE > 4K. If it came to > > > > > > I am not sure if 'being a purpose built driver' argument is strong enough > > > here, at least the Kconfig does not seems to be suggesting it is a purpose > > > built driver, perhaps add a 'depend on' to suggest that? > > > > I'm not sure if you have been following the other threads. One of the > > general thoughts of pushback against this driver was that Meta is > > currently the only company that will have possession of this NIC. As > > such Meta will be deciding what systems it goes into and as a result > > of that we aren't likely to be running it on systems with 64K pages. > > Didn't take long for this argument to float to the surface.. This wasn't my full argument. You truncated the part where I specifically called out that it is hard to justify us pushing a proprietary API that is only used by our driver. > We tried to write some rules with Paolo but haven't published them, yet. > Here is one that may be relevant: > > 3. External contributions > ------------------------- > > Owners of drivers for private devices must not exhibit a stronger > sense of ownership or push back on accepting code changes from > members of the community. 3rd party contributions should be evaluated > and eventually accepted, or challenged only on technical arguments > based on the code itself. In particular, the argument that the owner > is the only user and therefore knows best should not be used. > > Not exactly a contribution, but we predicted the "we know best" > tone of the argument :( The "we know best" is more of an "I know best" as someone who has worked with page pool and the page fragment API since well before it existed. My push back is based on the fact that we don't want to allocate fragments, we want to allocate pages and fragment them ourselves after the fact. As such it doesn't make much sense to add an API that will have us trying to use the page fragment API which holds onto the page when the expectation is that we will take the whole thing and just fragment it ourselves. If we are going to use the page fragment API we need the ability to convert a page pool page to a fragment in place, not have it get pulled into the page fragment API and then immediately yanked right back out. On top of that we don't need to be making significant changes to the API that will slow down all the other users to accomodate a driver that will not be used by most users. This is a case where I agree with Jiri. It doesn't make sense to slow down all the other users of the page fragment API by making it so that we can pull variable sliced batches from the page pool fragment interface. It makes much more sense for us to just fragment in place and add as little overhead to the other users of page pool APIs as possible.
On Mon, 15 Apr 2024 11:03:13 -0700 Alexander Duyck wrote: > This wasn't my full argument. You truncated the part where I > specifically called out that it is hard to justify us pushing a > proprietary API that is only used by our driver. I see. Please be careful when making such arguments, tho. > The "we know best" is more of an "I know best" as someone who has > worked with page pool and the page fragment API since well before it > existed. My push back is based on the fact that we don't want to > allocate fragments, we want to allocate pages and fragment them > ourselves after the fact. As such it doesn't make much sense to add an > API that will have us trying to use the page fragment API which holds > onto the page when the expectation is that we will take the whole > thing and just fragment it ourselves. To be clear I'm not arguing for the exact use of the API as suggested. Or even that we should support this in the shared API. One would probably have to take a stab at coding it up to find out what works best. My first try FWIW would be to mask off the low bits of the page index, eg. for 64k page making entries 0-15 all use rx_buf index 0...
On Mon, Apr 15, 2024 at 11:19 AM Jakub Kicinski <kuba@kernel.org> wrote: > > On Mon, 15 Apr 2024 11:03:13 -0700 Alexander Duyck wrote: > > This wasn't my full argument. You truncated the part where I > > specifically called out that it is hard to justify us pushing a > > proprietary API that is only used by our driver. > > I see. Please be careful when making such arguments, tho. > > > The "we know best" is more of an "I know best" as someone who has > > worked with page pool and the page fragment API since well before it > > existed. My push back is based on the fact that we don't want to > > allocate fragments, we want to allocate pages and fragment them > > ourselves after the fact. As such it doesn't make much sense to add an > > API that will have us trying to use the page fragment API which holds > > onto the page when the expectation is that we will take the whole > > thing and just fragment it ourselves. > > To be clear I'm not arguing for the exact use of the API as suggested. > Or even that we should support this in the shared API. One would > probably have to take a stab at coding it up to find out what works > best. My first try FWIW would be to mask off the low bits of the > page index, eg. for 64k page making entries 0-15 all use rx_buf > index 0... It would take a few more changes to make it all work. Basically we would need to map the page into every descriptor entry since the worst case scenario would be that somehow we end up with things getting so tight that the page is only partially mapped and we are working through it as a subset of 4K slices with some at the beginning being unmapped from the descriptor ring while some are still waiting to be assigned to a descriptor and used. What I would probably have to look at doing is adding some sort of cache on the ring to hold onto it while we dole it out 4K at a time to the descriptors. Either that or enforce a hard 16 descriptor limit where we have to assign a full page with every allocation meaning we are at a higher risk for starving the device for memory. The bigger issue would be how could we test it? This is an OCP NIC and as far as I am aware we don't have any systems available that would support a 64K page. I suppose I could rebuild the QEMU for an architecture that supports 64K pages and test it. It would just be painful to have to set up a virtual system to test code that would literally never be used again. I am not sure QEMU can generate enough stress to really test the page allocator and make sure all corner cases are covered.
On Mon, 15 Apr 2024 11:55:37 -0700 Alexander Duyck wrote: > It would take a few more changes to make it all work. Basically we > would need to map the page into every descriptor entry since the worst > case scenario would be that somehow we end up with things getting so > tight that the page is only partially mapped and we are working > through it as a subset of 4K slices with some at the beginning being > unmapped from the descriptor ring while some are still waiting to be > assigned to a descriptor and used. What I would probably have to look > at doing is adding some sort of cache on the ring to hold onto it > while we dole it out 4K at a time to the descriptors. Either that or > enforce a hard 16 descriptor limit where we have to assign a full page > with every allocation meaning we are at a higher risk for starving the > device for memory. Hm, that would be more work, indeed, but potentially beneficial. I was thinking of separating the page allocation and draining logic a bit from the fragment handling logic. #define RXPAGE_IDX(idx) ((idx) >> PAGE_SHIFT - 12) in fbnic_clean_bdq(): while (RXPAGE_IDX(head) != RXPAGE_IDX(hw_head)) refer to rx_buf as: struct fbnic_rx_buf *rx_buf = &ring->rx_buf[idx >> LOSE_BITS]; Refill always works in batches of multiple of PAGE_SIZE / 4k. > The bigger issue would be how could we test it? This is an OCP NIC and > as far as I am aware we don't have any systems available that would > support a 64K page. I suppose I could rebuild the QEMU for an > architecture that supports 64K pages and test it. It would just be > painful to have to set up a virtual system to test code that would > literally never be used again. I am not sure QEMU can generate enough > stress to really test the page allocator and make sure all corner > cases are covered. The testing may be tricky. We could possibly test with hacking up the driver to use compound pages (say always allocate 16k) and making sure we don't refer to PAGE_SIZE directly in the test. BTW I have a spreadsheet of "promises", I'd be fine if we set a deadline for FBNIC to gain support for PAGE_SIZE != 4k and Kconfig to x86-only for now..
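Sketching that suggestion out a little further (illustrative only, and equivalent to the shift in the macro above): on a 64K-page system, sixteen consecutive 4K buffer IDs would map to one rx_buf, with the offset into the page derived from the low bits of the index:

/* Number of 4K hardware buffers carved from one host page
 * (1 when PAGE_SIZE == 4K, 16 when PAGE_SIZE == 64K).
 */
#define HW_BUFS_PER_PAGE	(PAGE_SIZE >> 12)

/* Which rx_buf (page) backs hardware buffer ID 'idx'. */
#define RXPAGE_IDX(idx)		((idx) / HW_BUFS_PER_PAGE)

/* Byte offset of hardware buffer ID 'idx' within its backing page. */
#define RXBUF_OFF(idx)		(((idx) % HW_BUFS_PER_PAGE) << 12)

/* When posting buffer 'idx', the DMA address written to the descriptor
 * would then be the page's base address plus that offset, e.g.:
 *
 *	dma = page_pool_get_dma_addr(rx_buf->page) + RXBUF_OFF(idx);
 *
 * with refill and drain walking the rx_buf array in whole-page steps.
 */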
On Mon, Apr 15, 2024 at 3:01 PM Jakub Kicinski <kuba@kernel.org> wrote: > > On Mon, 15 Apr 2024 11:55:37 -0700 Alexander Duyck wrote: > > It would take a few more changes to make it all work. Basically we > > would need to map the page into every descriptor entry since the worst > > case scenario would be that somehow we end up with things getting so > > tight that the page is only partially mapped and we are working > > through it as a subset of 4K slices with some at the beginning being > > unmapped from the descriptor ring while some are still waiting to be > > assigned to a descriptor and used. What I would probably have to look > > at doing is adding some sort of cache on the ring to hold onto it > > while we dole it out 4K at a time to the descriptors. Either that or > > enforce a hard 16 descriptor limit where we have to assign a full page > > with every allocation meaning we are at a higher risk for starving the > > device for memory. > > Hm, that would be more work, indeed, but potentially beneficial. I was > thinking of separating the page allocation and draining logic a bit > from the fragment handling logic. > > #define RXPAGE_IDX(idx) ((idx) >> PAGE_SHIFT - 12) > > in fbnic_clean_bdq(): > > while (RXPAGE_IDX(head) != RXPAGE_IDX(hw_head)) > > refer to rx_buf as: > > struct fbnic_rx_buf *rx_buf = &ring->rx_buf[idx >> LOSE_BITS]; > > Refill always works in batches of multiple of PAGE_SIZE / 4k. > > > The bigger issue would be how could we test it? This is an OCP NIC and > > as far as I am aware we don't have any systems available that would > > support a 64K page. I suppose I could rebuild the QEMU for an > > architecture that supports 64K pages and test it. It would just be > > painful to have to set up a virtual system to test code that would > > literally never be used again. I am not sure QEMU can generate enough > > stress to really test the page allocator and make sure all corner > > cases are covered. > > The testing may be tricky. We could possibly test with hacking up the > driver to use compound pages (say always allocate 16k) and making sure > we don't refer to PAGE_SIZE directly in the test. > > BTW I have a spreadsheet of "promises", I'd be fine if we set a > deadline for FBNIC to gain support for PAGE_SIZE != 4k and Kconfig > to x86-only for now.. Why set a deadline? It doesn't make sense to add as a feature for now. I would be fine with limiting it to x86-only and then stating that if we need to change it to add support for an architecture that does support !4K page size then we can cross that bridge when we get there as it would be much more likely that we would have access to a platform to test it on rather than adding overhead to the code to support a setup that this device may never see in its lifetime.
On Mon, 15 Apr 2024 16:57:54 -0700 Alexander Duyck wrote: > > The testing may be tricky. We could possibly test with hacking up the > > driver to use compound pages (say always allocate 16k) and making sure > > we don't refer to PAGE_SIZE directly in the test. > > > > BTW I have a spreadsheet of "promises", I'd be fine if we set a > > deadline for FBNIC to gain support for PAGE_SIZE != 4k and Kconfig > > to x86-only for now.. > > Why set a deadline? It doesn't make sense to add as a feature for now. Okay, maybe I'm trying to be too nice. Please have all the feedback addressed in v2.
On 2024/4/16 6:01, Jakub Kicinski wrote: > On Mon, 15 Apr 2024 11:55:37 -0700 Alexander Duyck wrote: >> It would take a few more changes to make it all work. Basically we >> would need to map the page into every descriptor entry since the worst >> case scenario would be that somehow we end up with things getting so >> tight that the page is only partially mapped and we are working >> through it as a subset of 4K slices with some at the beginning being >> unmapped from the descriptor ring while some are still waiting to be >> assigned to a descriptor and used. What I would probably have to look >> at doing is adding some sort of cache on the ring to hold onto it >> while we dole it out 4K at a time to the descriptors. Either that or >> enforce a hard 16 descriptor limit where we have to assign a full page >> with every allocation meaning we are at a higher risk for starving the >> device for memory. > > Hm, that would be more work, indeed, but potentially beneficial. I was > thinking of separating the page allocation and draining logic a bit > from the fragment handling logic. > > #define RXPAGE_IDX(idx) ((idx) >> PAGE_SHIFT - 12) > > in fbnic_clean_bdq(): > > while (RXPAGE_IDX(head) != RXPAGE_IDX(hw_head)) > > refer to rx_buf as: > > struct fbnic_rx_buf *rx_buf = &ring->rx_buf[idx >> LOSE_BITS]; > > Refill always works in batches of multiple of PAGE_SIZE / 4k. Are we expecting drivers wanting best possible performance doing the above duplicated trick? "grep -rn '_reuse_' drivers/net/ethernet/" seems to suggest that we already have similar trick to do the page spliting in a lot of drivers, I would rather we do not duplicate the above trick again.
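For context, the "reuse" trick referred to above is the half-page flip-flop found in a number of existing drivers; roughly (a simplified sketch, not taken from any particular driver, reusing the illustrative rx_buf from earlier):

#include <linux/page_ref.h>

/* The driver's reference to this half of the page has just been handed
 * to the skb.  If that is now the only reference left, flip to the
 * other half and take a fresh reference so the page can be posted to
 * the hardware again instead of being freed and reallocated.
 */
static bool rx_buf_try_reuse(struct rx_buf *buf)
{
	if (page_ref_count(buf->page) != 1)
		return false;		/* still in flight elsewhere */

	buf->offset ^= PAGE_SIZE / 2;	/* alternate between the two halves */
	page_ref_inc(buf->page);	/* new reference for the rx ring */
	return true;
}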
From: Alexander Duyck <alexander.duyck@gmail.com> Date: Mon, 15 Apr 2024 11:03:13 -0700 > On Mon, Apr 15, 2024 at 10:11 AM Jakub Kicinski <kuba@kernel.org> wrote: >> >> On Mon, 15 Apr 2024 08:03:38 -0700 Alexander Duyck wrote: >>>>> The advantage of being a purpose built driver is that we aren't >>>>> running on any architectures where the PAGE_SIZE > 4K. If it came to >>>> >>>> I am not sure if 'being a purpose built driver' argument is strong enough >>>> here, at least the Kconfig does not seems to be suggesting it is a purpose >>>> built driver, perhaps add a 'depend on' to suggest that? >>> >>> I'm not sure if you have been following the other threads. One of the >>> general thoughts of pushback against this driver was that Meta is >>> currently the only company that will have possession of this NIC. As >>> such Meta will be deciding what systems it goes into and as a result >>> of that we aren't likely to be running it on systems with 64K pages. >> >> Didn't take long for this argument to float to the surface.. > > This wasn't my full argument. You truncated the part where I > specifically called out that it is hard to justify us pushing a > proprietary API that is only used by our driver. > >> We tried to write some rules with Paolo but haven't published them, yet. >> Here is one that may be relevant: >> >> 3. External contributions >> ------------------------- >> >> Owners of drivers for private devices must not exhibit a stronger >> sense of ownership or push back on accepting code changes from >> members of the community. 3rd party contributions should be evaluated >> and eventually accepted, or challenged only on technical arguments >> based on the code itself. In particular, the argument that the owner >> is the only user and therefore knows best should not be used. >> >> Not exactly a contribution, but we predicted the "we know best" >> tone of the argument :( > > The "we know best" is more of an "I know best" as someone who has > worked with page pool and the page fragment API since well before it > existed. My push back is based on the fact that we don't want to I still strongly believe Jesper-style arguments like "I've been working with this for aeons", "I invented the Internet", "I was born 3 decades before this API was introduced" are not valid arguments. > allocate fragments, we want to allocate pages and fragment them > ourselves after the fact. As such it doesn't make much sense to add an > API that will have us trying to use the page fragment API which holds > onto the page when the expectation is that we will take the whole > thing and just fragment it ourselves. [...] Re "this HW works only on x86, why bother" -- I still believe there shouldn't be any hardcodes in any driver based on the fact that the HW is deployed only on particular systems. Page sizes, Endianness, 32/64-bit... It's not difficult to make a driver look like it's universal and could work anywhere, really. Thanks, Olek
On Tue, Apr 16, 2024 at 6:25 AM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > On 2024/4/16 6:01, Jakub Kicinski wrote: > > On Mon, 15 Apr 2024 11:55:37 -0700 Alexander Duyck wrote: > >> It would take a few more changes to make it all work. Basically we > >> would need to map the page into every descriptor entry since the worst > >> case scenario would be that somehow we end up with things getting so > >> tight that the page is only partially mapped and we are working > >> through it as a subset of 4K slices with some at the beginning being > >> unmapped from the descriptor ring while some are still waiting to be > >> assigned to a descriptor and used. What I would probably have to look > >> at doing is adding some sort of cache on the ring to hold onto it > >> while we dole it out 4K at a time to the descriptors. Either that or > >> enforce a hard 16 descriptor limit where we have to assign a full page > >> with every allocation meaning we are at a higher risk for starving the > >> device for memory. > > > > Hm, that would be more work, indeed, but potentially beneficial. I was > > thinking of separating the page allocation and draining logic a bit > > from the fragment handling logic. > > > > #define RXPAGE_IDX(idx) ((idx) >> PAGE_SHIFT - 12) > > > > in fbnic_clean_bdq(): > > > > while (RXPAGE_IDX(head) != RXPAGE_IDX(hw_head)) > > > > refer to rx_buf as: > > > > struct fbnic_rx_buf *rx_buf = &ring->rx_buf[idx >> LOSE_BITS]; > > > > Refill always works in batches of multiple of PAGE_SIZE / 4k. > > Are we expecting drivers wanting best possible performance doing the > above duplicated trick? > > "grep -rn '_reuse_' drivers/net/ethernet/" seems to suggest that we > already have similar trick to do the page spliting in a lot of drivers, > I would rather we do not duplicate the above trick again. Then why not focus on those drivers? You may have missed the whole point but it isn't possible to test this device on a system with 64K pages currently. There aren't any platforms we can drop the device into that support that.
On Tue, Apr 16, 2024 at 7:05 AM Alexander Lobakin <aleksander.lobakin@intel.com> wrote: > > From: Alexander Duyck <alexander.duyck@gmail.com> > Date: Mon, 15 Apr 2024 11:03:13 -0700 > > > On Mon, Apr 15, 2024 at 10:11 AM Jakub Kicinski <kuba@kernel.org> wrote: > >> > >> On Mon, 15 Apr 2024 08:03:38 -0700 Alexander Duyck wrote: > >>>>> The advantage of being a purpose built driver is that we aren't > >>>>> running on any architectures where the PAGE_SIZE > 4K. If it came to > >>>> > >>>> I am not sure if 'being a purpose built driver' argument is strong enough > >>>> here, at least the Kconfig does not seems to be suggesting it is a purpose > >>>> built driver, perhaps add a 'depend on' to suggest that? > >>> > >>> I'm not sure if you have been following the other threads. One of the > >>> general thoughts of pushback against this driver was that Meta is > >>> currently the only company that will have possession of this NIC. As > >>> such Meta will be deciding what systems it goes into and as a result > >>> of that we aren't likely to be running it on systems with 64K pages. > >> > >> Didn't take long for this argument to float to the surface.. > > > > This wasn't my full argument. You truncated the part where I > > specifically called out that it is hard to justify us pushing a > > proprietary API that is only used by our driver. > > > >> We tried to write some rules with Paolo but haven't published them, yet. > >> Here is one that may be relevant: > >> > >> 3. External contributions > >> ------------------------- > >> > >> Owners of drivers for private devices must not exhibit a stronger > >> sense of ownership or push back on accepting code changes from > >> members of the community. 3rd party contributions should be evaluated > >> and eventually accepted, or challenged only on technical arguments > >> based on the code itself. In particular, the argument that the owner > >> is the only user and therefore knows best should not be used. > >> > >> Not exactly a contribution, but we predicted the "we know best" > >> tone of the argument :( > > > > The "we know best" is more of an "I know best" as someone who has > > worked with page pool and the page fragment API since well before it > > existed. My push back is based on the fact that we don't want to > > I still strongly believe Jesper-style arguments like "I've been working > with this for aeons", "I invented the Internet", "I was born 3 decades > before this API was introduced" are not valid arguments. Sorry that is a bit of my frustration with Yunsheng coming through. He has another patch set that mostly just moves my code and made himself the maintainer. Admittedly I am a bit annoyed with that. Especially since the main drive seems to be to force everything to use that one approach and then optimize for his use case for vhost net over all others most likely at the expense of everything else. It seems like it is the very thing we were complaining about in patch 0 with other drivers getting penalized at the cost of optimizing for one specific driver. > > allocate fragments, we want to allocate pages and fragment them > > ourselves after the fact. As such it doesn't make much sense to add an > > API that will have us trying to use the page fragment API which holds > > onto the page when the expectation is that we will take the whole > > thing and just fragment it ourselves. > > [...] 
> > Re "this HW works only on x86, why bother" -- I still believe there > shouldn't be any hardcodes in any driver based on the fact that the HW > is deployed only on particular systems. Page sizes, Endianness, > 32/64-bit... It's not difficult to make a driver look like it's > universal and could work anywhere, really. It isn't that this only works on x86. It is that we can only test it on x86. The biggest issue right now is that I wouldn't have any systems w/ 64K pages that I could test on, and the worst that could happen based on the current code is that the NIC driver will be a memory hog. I would much prefer the potential of being a memory hog on an untested hardware over implementing said code untested and introducing something like a memory leak or page fault issue. That is why I am more than willing to make this an x86_64 only driver for now and we can look at expanding out as I get time and get equipment to to add support and test for other architectures.
> Sorry that is a bit of my frustration with Yunsheng coming through. He > has another patch set that mostly just moves my code and made himself > the maintainer. Admittedly I am a bit annoyed with that. Especially > since the main drive seems to be to force everything to use that one > approach and then optimize for his use case for vhost net over all > others most likely at the expense of everything else. That is why we ask for benchmarks for "optimization patches". If they don't actually provide any improvements, or degrade other use cases, they get rejected. > That is why I am more than willing to make this an x86_64 only driver > for now and we can look at expanding out as I get time and get > equipment to add support and test for other architectures. That sounds reasonable to me. But I would allow COMPILE_TEST for other architectures, just to keep the number of surprises low when you do have other architectures to test with. Andrew
On Tue, Apr 16, 2024 at 07:46:06AM -0700, Alexander Duyck wrote:
> On Tue, Apr 16, 2024 at 7:05 AM Alexander Lobakin <aleksander.lobakin@intel.com> wrote:

[...]

> It isn't that this only works on x86. It is that we can only test it on x86. The biggest issue right now is that I wouldn't have any systems w/ 64K pages that I could test on.

Didn't you write that you will provide QEMU emulation for this device?

Thanks
From: Alexander Duyck <alexander.duyck@gmail.com>
Date: Tue, 16 Apr 2024 07:46:06 -0700

[...]

> That is why I am more than willing to make this an x86_64 only driver for now and we can look at expanding out as I get time and get equipment to add support and test for other architectures.

I don't see any issue with not limiting it to x86_64 only. It compiles just fine, and if you run it later on a non-x86_64 system, you'll test it then. I don't think anyone will run it on a different platform prior to that.

Thanks,
Olek
On Wed, Apr 17, 2024 at 1:14 AM Leon Romanovsky <leon@kernel.org> wrote:
> On Tue, Apr 16, 2024 at 07:46:06AM -0700, Alexander Duyck wrote:

[...]

> > It isn't that this only works on x86. It is that we can only test it on x86. The biggest issue right now is that I wouldn't have any systems w/ 64K pages that I could test on.
>
> Didn't you write that you will provide QEMU emulation for this device?

Yes. I had already mentioned the possibility of testing it this way. I am just not sure it adds much value to test the already limited hardware and limited platform setup in emulation. The issue is that it will be hard to generate any stress in the QEMU environment, since it maxes out at only about 1 or 2 Gbps as the overhead of providing the TCAMs in software is not insignificant. To top it off, emulating a non-native architecture will slow things down further. It would be like asking someone to test a 100G NIC on a system that only has PCIe gen1.

I will probably just go the suggested route of enabling compile testing on all platforms, and only support loading it on X86_64. As time permits I can probably assign somebody the job of exploring the issue of page sizes larger than 4K, however it will be a matter of weighing the trade-off of adding technical debt for a use case that may not be applicable.
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h index 0819ddc1dcc8..f61b401fdd5c 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h @@ -90,6 +90,66 @@ enum { #define FBNIC_TCD_DONE DESC_BIT(63) +/* Rx Completion Queue Descriptors */ +#define FBNIC_RCD_TYPE_MASK DESC_GENMASK(62, 61) +enum { + FBNIC_RCD_TYPE_HDR_AL = 0, + FBNIC_RCD_TYPE_PAY_AL = 1, + FBNIC_RCD_TYPE_OPT_META = 2, + FBNIC_RCD_TYPE_META = 3, +}; + +#define FBNIC_RCD_DONE DESC_BIT(63) + +/* Address/Length Completion Descriptors */ +#define FBNIC_RCD_AL_BUFF_ID_MASK DESC_GENMASK(15, 0) +#define FBNIC_RCD_AL_BUFF_LEN_MASK DESC_GENMASK(28, 16) +#define FBNIC_RCD_AL_BUFF_OFF_MASK DESC_GENMASK(43, 32) +#define FBNIC_RCD_AL_PAGE_FIN DESC_BIT(60) + +/* Header AL specific values */ +#define FBNIC_RCD_HDR_AL_OVERFLOW DESC_BIT(53) +#define FBNIC_RCD_HDR_AL_DMA_HINT_MASK DESC_GENMASK(59, 54) +enum { + FBNIC_RCD_HDR_AL_DMA_HINT_NONE = 0, + FBNIC_RCD_HDR_AL_DMA_HINT_L2 = 1, + FBNIC_RCD_HDR_AL_DMA_HINT_L3 = 2, + FBNIC_RCD_HDR_AL_DMA_HINT_L4 = 4, +}; + +/* Optional Metadata Completion Descriptors */ +#define FBNIC_RCD_OPT_META_TS_MASK DESC_GENMASK(39, 0) +#define FBNIC_RCD_OPT_META_ACTION_MASK DESC_GENMASK(45, 40) +#define FBNIC_RCD_OPT_META_ACTION DESC_BIT(57) +#define FBNIC_RCD_OPT_META_TS DESC_BIT(58) +#define FBNIC_RCD_OPT_META_TYPE_MASK DESC_GENMASK(60, 59) + +/* Metadata Completion Descriptors */ +#define FBNIC_RCD_META_RSS_HASH_MASK DESC_GENMASK(31, 0) +#define FBNIC_RCD_META_L2_CSUM_MASK DESC_GENMASK(47, 32) +#define FBNIC_RCD_META_L3_TYPE_MASK DESC_GENMASK(49, 48) +enum { + FBNIC_RCD_META_L3_TYPE_OTHER = 0, + FBNIC_RCD_META_L3_TYPE_IPV4 = 1, + FBNIC_RCD_META_L3_TYPE_IPV6 = 2, + FBNIC_RCD_META_L3_TYPE_V6V6 = 3, +}; + +#define FBNIC_RCD_META_L4_TYPE_MASK DESC_GENMASK(51, 50) +enum { + FBNIC_RCD_META_L4_TYPE_OTHER = 0, + FBNIC_RCD_META_L4_TYPE_TCP = 1, + FBNIC_RCD_META_L4_TYPE_UDP = 2, +}; + +#define FBNIC_RCD_META_L4_CSUM_UNNECESSARY DESC_BIT(52) +#define FBNIC_RCD_META_ERR_MAC_EOP DESC_BIT(53) +#define FBNIC_RCD_META_ERR_TRUNCATED_FRAME DESC_BIT(54) +#define FBNIC_RCD_META_ERR_PARSER DESC_BIT(55) +#define FBNIC_RCD_META_UNCORRECTABLE_ERR_MASK \ + (FBNIC_RCD_META_ERR_MAC_EOP | FBNIC_RCD_META_ERR_TRUNCATED_FRAME) +#define FBNIC_RCD_META_ECN DESC_BIT(60) + /* Rx Buffer Descriptor Format */ #define FBNIC_BD_PAGE_ADDR_MASK DESC_GENMASK(45, 12) #define FBNIC_BD_PAGE_ID_MASK DESC_GENMASK(63, 48) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index 91d4ea2bfb29..792bdfa7429d 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -150,8 +150,10 @@ struct net_device *fbnic_netdev_alloc(struct fbnic_dev *fbd) fbnic_reset_queues(fbn, default_queues, default_queues); netdev->features |= + NETIF_F_RXHASH | NETIF_F_SG | - NETIF_F_HW_CSUM; + NETIF_F_HW_CSUM | + NETIF_F_RXCSUM; netdev->hw_features |= netdev->features; netdev->vlan_features |= netdev->features; diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index f243950c68bb..d897b0d65abf 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -268,6 +268,9 @@ static void fbnic_service_task(struct work_struct *work) fbnic_health_check(fbd); + if (netif_carrier_ok(fbd->netdev)) + fbnic_napi_depletion_check(fbd->netdev); + if (netif_running(fbd->netdev)) 
schedule_delayed_work(&fbd->service_task, HZ); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c index ad4cb059c959..fe35112b9075 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c @@ -131,6 +131,24 @@ fbnic_tx_offloads(struct fbnic_ring *ring, struct sk_buff *skb, __le64 *meta) return false; } +static void +fbnic_rx_csum(u64 rcd, struct sk_buff *skb, struct fbnic_ring *rcq) +{ + skb_checksum_none_assert(skb); + + if (unlikely(!(skb->dev->features & NETIF_F_RXCSUM))) + return; + + if (FIELD_GET(FBNIC_RCD_META_L4_CSUM_UNNECESSARY, rcd)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + u16 csum = FIELD_GET(FBNIC_RCD_META_L2_CSUM_MASK, rcd); + + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum = (__force __wsum)csum; + } +} + static bool fbnic_tx_map(struct fbnic_ring *ring, struct sk_buff *skb, __le64 *meta) { @@ -364,6 +382,16 @@ static void fbnic_page_pool_init(struct fbnic_ring *ring, unsigned int idx, rx_buf->page = page; } +static struct page *fbnic_page_pool_get(struct fbnic_ring *ring, + unsigned int idx) +{ + struct fbnic_rx_buf *rx_buf = &ring->rx_buf[idx]; + + rx_buf->pagecnt_bias--; + + return rx_buf->page; +} + static void fbnic_page_pool_drain(struct fbnic_ring *ring, unsigned int idx, struct fbnic_napi_vector *nv, int budget) { @@ -501,6 +529,98 @@ static void fbnic_fill_bdq(struct fbnic_napi_vector *nv, struct fbnic_ring *bdq) } } +static unsigned int fbnic_hdr_pg_start(unsigned int pg_off) +{ + /* The headroom of the first header may be larger than FBNIC_RX_HROOM + * due to alignment. So account for that by just making the page + * offset 0 if we are starting at the first header. + */ + if (ALIGN(FBNIC_RX_HROOM, 128) > FBNIC_RX_HROOM && + pg_off == ALIGN(FBNIC_RX_HROOM, 128)) + return 0; + + return pg_off - FBNIC_RX_HROOM; +} + +static unsigned int fbnic_hdr_pg_end(unsigned int pg_off, unsigned int len) +{ + /* Determine the end of the buffer by finding the start of the next + * and then subtracting the headroom from that frame. + */ + pg_off += len + FBNIC_RX_TROOM + FBNIC_RX_HROOM; + + return ALIGN(pg_off, 128) - FBNIC_RX_HROOM; +} + +static void fbnic_pkt_prepare(struct fbnic_napi_vector *nv, u64 rcd, + struct fbnic_pkt_buff *pkt, + struct fbnic_q_triad *qt) +{ + unsigned int hdr_pg_off = FIELD_GET(FBNIC_RCD_AL_BUFF_OFF_MASK, rcd); + unsigned int hdr_pg_idx = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd); + struct page *page = fbnic_page_pool_get(&qt->sub0, hdr_pg_idx); + unsigned int len = FIELD_GET(FBNIC_RCD_AL_BUFF_LEN_MASK, rcd); + unsigned int frame_sz, hdr_pg_start, hdr_pg_end, headroom; + unsigned char *hdr_start; + + /* data_hard_start should always be NULL when this is called */ + WARN_ON_ONCE(pkt->buff.data_hard_start); + + /* Short-cut the end caclulation if we know page is fully consumed */ + hdr_pg_end = FIELD_GET(FBNIC_RCD_AL_PAGE_FIN, rcd) ? 
+ PAGE_SIZE : fbnic_hdr_pg_end(hdr_pg_off, len); + hdr_pg_start = fbnic_hdr_pg_start(hdr_pg_off); + + frame_sz = hdr_pg_end - hdr_pg_start; + xdp_init_buff(&pkt->buff, frame_sz, NULL); + + /* Sync DMA buffer */ + dma_sync_single_range_for_cpu(nv->dev, page_pool_get_dma_addr(page), + hdr_pg_start, frame_sz, + DMA_BIDIRECTIONAL); + + /* Build frame around buffer */ + hdr_start = page_address(page) + hdr_pg_start; + headroom = hdr_pg_off - hdr_pg_start + FBNIC_RX_PAD; + + xdp_prepare_buff(&pkt->buff, hdr_start, headroom, + len - FBNIC_RX_PAD, true); + + pkt->data_truesize = 0; + pkt->data_len = 0; + pkt->nr_frags = 0; +} + +static void fbnic_add_rx_frag(struct fbnic_napi_vector *nv, u64 rcd, + struct fbnic_pkt_buff *pkt, + struct fbnic_q_triad *qt) +{ + unsigned int pg_off = FIELD_GET(FBNIC_RCD_AL_BUFF_OFF_MASK, rcd); + unsigned int pg_idx = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd); + unsigned int len = FIELD_GET(FBNIC_RCD_AL_BUFF_LEN_MASK, rcd); + struct page *page = fbnic_page_pool_get(&qt->sub1, pg_idx); + struct skb_shared_info *shinfo; + unsigned int truesize; + + truesize = FIELD_GET(FBNIC_RCD_AL_PAGE_FIN, rcd) ? PAGE_SIZE - pg_off : + ALIGN(len, 128); + + /* Sync DMA buffer */ + dma_sync_single_range_for_cpu(nv->dev, page_pool_get_dma_addr(page), + pg_off, truesize, DMA_BIDIRECTIONAL); + + /* Add page to xdp shared info */ + shinfo = xdp_get_shared_info_from_buff(&pkt->buff); + + /* We use gso_segs to store truesize */ + pkt->data_truesize += truesize; + + __skb_fill_page_desc_noacc(shinfo, pkt->nr_frags++, page, pg_off, len); + + /* Store data_len in gso_size */ + pkt->data_len += len; +} + static void fbnic_put_pkt_buff(struct fbnic_napi_vector *nv, struct fbnic_pkt_buff *pkt, int budget) { @@ -522,7 +642,167 @@ static void fbnic_put_pkt_buff(struct fbnic_napi_vector *nv, page = virt_to_page(pkt->buff.data_hard_start); page_pool_put_full_page(nv->page_pool, page, !!budget); pkt->buff.data_hard_start = NULL; -}; +} + +static struct sk_buff *fbnic_build_skb(struct fbnic_napi_vector *nv, + struct fbnic_pkt_buff *pkt) +{ + unsigned int nr_frags = pkt->nr_frags; + struct skb_shared_info *shinfo; + unsigned int truesize; + struct sk_buff *skb; + + truesize = xdp_data_hard_end(&pkt->buff) + FBNIC_RX_TROOM - + pkt->buff.data_hard_start; + + /* Build frame around buffer */ + skb = napi_build_skb(pkt->buff.data_hard_start, truesize); + if (unlikely(!skb)) + return NULL; + + /* Push data pointer to start of data, put tail to end of data */ + skb_reserve(skb, pkt->buff.data - pkt->buff.data_hard_start); + __skb_put(skb, pkt->buff.data_end - pkt->buff.data); + + /* Add tracking for metadata at the start of the frame */ + skb_metadata_set(skb, pkt->buff.data - pkt->buff.data_meta); + + /* Add Rx frags */ + if (nr_frags) { + /* Verify that shared info didn't move */ + shinfo = xdp_get_shared_info_from_buff(&pkt->buff); + WARN_ON(skb_shinfo(skb) != shinfo); + + skb->truesize += pkt->data_truesize; + skb->data_len += pkt->data_len; + shinfo->nr_frags = nr_frags; + skb->len += pkt->data_len; + } + + skb_mark_for_recycle(skb); + + /* Set MAC header specific fields */ + skb->protocol = eth_type_trans(skb, nv->napi.dev); + + return skb; +} + +static enum pkt_hash_types fbnic_skb_hash_type(u64 rcd) +{ + return (FBNIC_RCD_META_L4_TYPE_MASK & rcd) ? PKT_HASH_TYPE_L4 : + (FBNIC_RCD_META_L3_TYPE_MASK & rcd) ? 
PKT_HASH_TYPE_L3 : + PKT_HASH_TYPE_L2; +} + +static void fbnic_populate_skb_fields(struct fbnic_napi_vector *nv, + u64 rcd, struct sk_buff *skb, + struct fbnic_q_triad *qt) +{ + struct net_device *netdev = nv->napi.dev; + struct fbnic_ring *rcq = &qt->cmpl; + + fbnic_rx_csum(rcd, skb, rcq); + + if (netdev->features & NETIF_F_RXHASH) + skb_set_hash(skb, + FIELD_GET(FBNIC_RCD_META_RSS_HASH_MASK, rcd), + fbnic_skb_hash_type(rcd)); + + skb_record_rx_queue(skb, rcq->q_idx); +} + +static bool fbnic_rcd_metadata_err(u64 rcd) +{ + return !!(FBNIC_RCD_META_UNCORRECTABLE_ERR_MASK & rcd); +} + +static int fbnic_clean_rcq(struct fbnic_napi_vector *nv, + struct fbnic_q_triad *qt, int budget) +{ + struct fbnic_ring *rcq = &qt->cmpl; + struct fbnic_pkt_buff *pkt; + s32 head0 = -1, head1 = -1; + __le64 *raw_rcd, done; + u32 head = rcq->head; + u64 packets = 0; + + done = (head & (rcq->size_mask + 1)) ? cpu_to_le64(FBNIC_RCD_DONE) : 0; + raw_rcd = &rcq->desc[head & rcq->size_mask]; + pkt = rcq->pkt; + + /* Walk the completion queue collecting the heads reported by NIC */ + while (likely(packets < budget)) { + struct sk_buff *skb = ERR_PTR(-EINVAL); + u64 rcd; + + if ((*raw_rcd & cpu_to_le64(FBNIC_RCD_DONE)) == done) + break; + + dma_rmb(); + + rcd = le64_to_cpu(*raw_rcd); + + switch (FIELD_GET(FBNIC_RCD_TYPE_MASK, rcd)) { + case FBNIC_RCD_TYPE_HDR_AL: + head0 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd); + fbnic_pkt_prepare(nv, rcd, pkt, qt); + + break; + case FBNIC_RCD_TYPE_PAY_AL: + head1 = FIELD_GET(FBNIC_RCD_AL_BUFF_ID_MASK, rcd); + fbnic_add_rx_frag(nv, rcd, pkt, qt); + + break; + case FBNIC_RCD_TYPE_OPT_META: + /* Only type 0 is currently supported */ + if (FIELD_GET(FBNIC_RCD_OPT_META_TYPE_MASK, rcd)) + break; + + /* We currently ignore the action table index */ + break; + case FBNIC_RCD_TYPE_META: + if (likely(!fbnic_rcd_metadata_err(rcd))) + skb = fbnic_build_skb(nv, pkt); + + /* populate skb and invalidate XDP */ + if (!IS_ERR_OR_NULL(skb)) { + fbnic_populate_skb_fields(nv, rcd, skb, qt); + + packets++; + + napi_gro_receive(&nv->napi, skb); + } + + pkt->buff.data_hard_start = NULL; + + break; + } + + raw_rcd++; + head++; + if (!(head & rcq->size_mask)) { + done ^= cpu_to_le64(FBNIC_RCD_DONE); + raw_rcd = &rcq->desc[0]; + } + } + + /* Unmap and free processed buffers */ + if (head0 >= 0) + fbnic_clean_bdq(nv, budget, &qt->sub0, head0); + fbnic_fill_bdq(nv, &qt->sub0); + + if (head1 >= 0) + fbnic_clean_bdq(nv, budget, &qt->sub1, head1); + fbnic_fill_bdq(nv, &qt->sub1); + + /* Record the current head/tail of the queue */ + if (rcq->head != head) { + rcq->head = head; + writel(head & rcq->size_mask, rcq->doorbell); + } + + return packets; +} static void fbnic_nv_irq_disable(struct fbnic_napi_vector *nv) { @@ -545,12 +825,19 @@ static int fbnic_poll(struct napi_struct *napi, int budget) struct fbnic_napi_vector *nv = container_of(napi, struct fbnic_napi_vector, napi); - int i; + int i, j, work_done = 0; for (i = 0; i < nv->txt_count; i++) fbnic_clean_tcq(nv, &nv->qt[i], budget); - if (likely(napi_complete_done(napi, 0))) + if (budget) + for (j = 0; j < nv->rxt_count; j++, i++) + work_done += fbnic_clean_rcq(nv, &nv->qt[i], budget); + + if (work_done >= budget) + return budget; + + if (likely(napi_complete_done(napi, work_done))) fbnic_nv_irq_rearm(nv); return 0; @@ -1555,3 +1842,32 @@ void fbnic_napi_enable(struct fbnic_net *fbn) } wrfl(); } + +void fbnic_napi_depletion_check(struct net_device *netdev) +{ + struct fbnic_net *fbn = netdev_priv(netdev); + u32 irqs[FBNIC_MAX_MSIX_VECS / 32] = {}; + struct 
fbnic_dev *fbd = fbn->fbd; + struct fbnic_napi_vector *nv; + int i, j; + + list_for_each_entry(nv, &fbn->napis, napis) { + /* Find RQs which are completely out of pages */ + for (i = nv->txt_count, j = 0; j < nv->rxt_count; j++, i++) { + /* Assume 4 pages is always enough to fit a packet + * and therefore generate a completion and an IRQ. + */ + if (fbnic_desc_used(&nv->qt[i].sub0) < 4 || + fbnic_desc_used(&nv->qt[i].sub1) < 4) + irqs[nv->v_idx / 32] |= BIT(nv->v_idx % 32); + } + } + + for (i = 0; i < ARRAY_SIZE(irqs); i++) { + if (!irqs[i]) + continue; + wr32(FBNIC_INTR_MASK_CLEAR(i), irqs[i]); + wr32(FBNIC_INTR_SET(i), irqs[i]); + } + wrfl(); +} diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h index 0c424c49866d..4e43d41c781a 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.h @@ -5,6 +5,7 @@ #define _FBNIC_TXRX_H_ #include <linux/netdevice.h> +#include <linux/skbuff.h> #include <linux/types.h> #include <net/xdp.h> @@ -118,6 +119,7 @@ void fbnic_config_drop_mode(struct fbnic_net *fbn); void fbnic_flush(struct fbnic_net *fbn); void fbnic_fill(struct fbnic_net *fbn); +void fbnic_napi_depletion_check(struct net_device *netdev); int fbnic_wait_all_queues_idle(struct fbnic_dev *fbd, bool may_fail); #endif /* _FBNIC_TXRX_H_ */
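One detail in fbnic_clean_rcq() above that is easy to miss is the completion-ring ownership convention: the expected value of the DONE bit flips each time the head index wraps, so software never has to clear consumed descriptors. A standalone sketch of the same idea in plain C follows; the ring size, bit position, and names are arbitrary, and a real implementation also needs a read barrier (dma_rmb() in the patch) before trusting the rest of the descriptor.

#include <stdbool.h>
#include <stdint.h>

#define RING_SIZE	256u			/* must be a power of two */
#define SIZE_MASK	(RING_SIZE - 1)
#define DONE_BIT	(1ull << 63)

struct cmpl_ring {
	volatile uint64_t desc[RING_SIZE];	/* written by the device */
	uint32_t head;				/* free-running, never masked */
};

/* Returns true and copies out the descriptor when the next entry is ready. */
static bool cmpl_ring_next(struct cmpl_ring *r, uint64_t *rcd)
{
	/* The bit just above the index mask tracks the wrap parity, so the
	 * "stale" value of DONE alternates between 0 and 1 on each pass
	 * over the ring.
	 */
	uint64_t stale = (r->head & (SIZE_MASK + 1)) ? DONE_BIT : 0;
	uint64_t raw = r->desc[r->head & SIZE_MASK];

	/* If DONE still matches the previous lap's value, the device has
	 * not written this slot yet.
	 */
	if ((raw & DONE_BIT) == stale)
		return false;

	*rcd = raw;
	r->head++;
	return true;
}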