diff mbox series

[vhost,3/6] virtio_net: replace private by pp struct inside page

Message ID 20240411025127.51945-4-xuanzhuo@linux.alibaba.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series virtio_net: rx enable premapped mode by default | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 942 this patch: 942
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 8 of 8 maintainers
netdev/build_clang success Errors and warnings before: 953 this patch: 953
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 953 this patch: 953
netdev/checkpatch fail ERROR: trailing statements should be on next line
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Xuan Zhuo April 11, 2024, 2:51 a.m. UTC
Now, we chain the pages of big mode by the page's private variable.
But a subsequent patch aims to make the big mode to support
premapped mode. This requires additional space to store the dma addr.

Within the sub-struct that contains the 'private', there is no suitable
variable for storing the DMA addr.

		struct {	/* Page cache and anonymous pages */
			/**
			 * @lru: Pageout list, eg. active_list protected by
			 * lruvec->lru_lock.  Sometimes used as a generic list
			 * by the page owner.
			 */
			union {
				struct list_head lru;

				/* Or, for the Unevictable "LRU list" slot */
				struct {
					/* Always even, to negate PageTail */
					void *__filler;
					/* Count page's or folio's mlocks */
					unsigned int mlock_count;
				};

				/* Or, free page */
				struct list_head buddy_list;
				struct list_head pcp_list;
			};
			/* See page-flags.h for PAGE_MAPPING_FLAGS */
			struct address_space *mapping;
			union {
				pgoff_t index;		/* Our offset within mapping. */
				unsigned long share;	/* share count for fsdax */
			};
			/**
			 * @private: Mapping-private opaque data.
			 * Usually used for buffer_heads if PagePrivate.
			 * Used for swp_entry_t if PageSwapCache.
			 * Indicates order in the buddy system if PageBuddy.
			 */
			unsigned long private;
		};

But within the page pool struct, we have a variable called
dma_addr that is appropriate for storing dma addr.
And that struct is used by netstack. That works to our advantage.

		struct {	/* page_pool used by netstack */
			/**
			 * @pp_magic: magic value to avoid recycling non
			 * page_pool allocated pages.
			 */
			unsigned long pp_magic;
			struct page_pool *pp;
			unsigned long _pp_mapping_pad;
			unsigned long dma_addr;
			atomic_long_t pp_ref_count;
		};

On the other side, we should use variables from the same sub-struct.
So this patch replaces the "private" with "pp".

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/net/virtio_net.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

Comments

Jason Wang April 12, 2024, 4:47 a.m. UTC | #1
On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> Now, we chain the pages of big mode by the page's private variable.
> But a subsequent patch aims to make the big mode to support
> premapped mode. This requires additional space to store the dma addr.
>
> Within the sub-struct that contains the 'private', there is no suitable
> variable for storing the DMA addr.
>
>                 struct {        /* Page cache and anonymous pages */
>                         /**
>                          * @lru: Pageout list, eg. active_list protected by
>                          * lruvec->lru_lock.  Sometimes used as a generic list
>                          * by the page owner.
>                          */
>                         union {
>                                 struct list_head lru;
>
>                                 /* Or, for the Unevictable "LRU list" slot */
>                                 struct {
>                                         /* Always even, to negate PageTail */
>                                         void *__filler;
>                                         /* Count page's or folio's mlocks */
>                                         unsigned int mlock_count;
>                                 };
>
>                                 /* Or, free page */
>                                 struct list_head buddy_list;
>                                 struct list_head pcp_list;
>                         };
>                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
>                         struct address_space *mapping;
>                         union {
>                                 pgoff_t index;          /* Our offset within mapping. */
>                                 unsigned long share;    /* share count for fsdax */
>                         };
>                         /**
>                          * @private: Mapping-private opaque data.
>                          * Usually used for buffer_heads if PagePrivate.
>                          * Used for swp_entry_t if PageSwapCache.
>                          * Indicates order in the buddy system if PageBuddy.
>                          */
>                         unsigned long private;
>                 };
>
> But within the page pool struct, we have a variable called
> dma_addr that is appropriate for storing dma addr.
> And that struct is used by netstack. That works to our advantage.
>
>                 struct {        /* page_pool used by netstack */
>                         /**
>                          * @pp_magic: magic value to avoid recycling non
>                          * page_pool allocated pages.
>                          */
>                         unsigned long pp_magic;
>                         struct page_pool *pp;
>                         unsigned long _pp_mapping_pad;
>                         unsigned long dma_addr;
>                         atomic_long_t pp_ref_count;
>                 };
>
> On the other side, we should use variables from the same sub-struct.
> So this patch replaces the "private" with "pp".
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---

Instead of doing a customized version of page pool, can we simply
switch to use page pool for big mode instead? Then we don't need to
bother the dma stuffs.

Thanks
Xuan Zhuo April 12, 2024, 5:35 a.m. UTC | #2
On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > Now, we chain the pages of big mode by the page's private variable.
> > But a subsequent patch aims to make the big mode to support
> > premapped mode. This requires additional space to store the dma addr.
> >
> > Within the sub-struct that contains the 'private', there is no suitable
> > variable for storing the DMA addr.
> >
> >                 struct {        /* Page cache and anonymous pages */
> >                         /**
> >                          * @lru: Pageout list, eg. active_list protected by
> >                          * lruvec->lru_lock.  Sometimes used as a generic list
> >                          * by the page owner.
> >                          */
> >                         union {
> >                                 struct list_head lru;
> >
> >                                 /* Or, for the Unevictable "LRU list" slot */
> >                                 struct {
> >                                         /* Always even, to negate PageTail */
> >                                         void *__filler;
> >                                         /* Count page's or folio's mlocks */
> >                                         unsigned int mlock_count;
> >                                 };
> >
> >                                 /* Or, free page */
> >                                 struct list_head buddy_list;
> >                                 struct list_head pcp_list;
> >                         };
> >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> >                         struct address_space *mapping;
> >                         union {
> >                                 pgoff_t index;          /* Our offset within mapping. */
> >                                 unsigned long share;    /* share count for fsdax */
> >                         };
> >                         /**
> >                          * @private: Mapping-private opaque data.
> >                          * Usually used for buffer_heads if PagePrivate.
> >                          * Used for swp_entry_t if PageSwapCache.
> >                          * Indicates order in the buddy system if PageBuddy.
> >                          */
> >                         unsigned long private;
> >                 };
> >
> > But within the page pool struct, we have a variable called
> > dma_addr that is appropriate for storing dma addr.
> > And that struct is used by netstack. That works to our advantage.
> >
> >                 struct {        /* page_pool used by netstack */
> >                         /**
> >                          * @pp_magic: magic value to avoid recycling non
> >                          * page_pool allocated pages.
> >                          */
> >                         unsigned long pp_magic;
> >                         struct page_pool *pp;
> >                         unsigned long _pp_mapping_pad;
> >                         unsigned long dma_addr;
> >                         atomic_long_t pp_ref_count;
> >                 };
> >
> > On the other side, we should use variables from the same sub-struct.
> > So this patch replaces the "private" with "pp".
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
>
> Instead of doing a customized version of page pool, can we simply
> switch to use page pool for big mode instead? Then we don't need to
> bother the dma stuffs.


The page pool needs to do the dma by the DMA APIs.
So we can not use the page pool directly.

Thanks.


>
> Thanks
>
Jason Wang April 12, 2024, 5:49 a.m. UTC | #3
On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > Now, we chain the pages of big mode by the page's private variable.
> > > But a subsequent patch aims to make the big mode to support
> > > premapped mode. This requires additional space to store the dma addr.
> > >
> > > Within the sub-struct that contains the 'private', there is no suitable
> > > variable for storing the DMA addr.
> > >
> > >                 struct {        /* Page cache and anonymous pages */
> > >                         /**
> > >                          * @lru: Pageout list, eg. active_list protected by
> > >                          * lruvec->lru_lock.  Sometimes used as a generic list
> > >                          * by the page owner.
> > >                          */
> > >                         union {
> > >                                 struct list_head lru;
> > >
> > >                                 /* Or, for the Unevictable "LRU list" slot */
> > >                                 struct {
> > >                                         /* Always even, to negate PageTail */
> > >                                         void *__filler;
> > >                                         /* Count page's or folio's mlocks */
> > >                                         unsigned int mlock_count;
> > >                                 };
> > >
> > >                                 /* Or, free page */
> > >                                 struct list_head buddy_list;
> > >                                 struct list_head pcp_list;
> > >                         };
> > >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> > >                         struct address_space *mapping;
> > >                         union {
> > >                                 pgoff_t index;          /* Our offset within mapping. */
> > >                                 unsigned long share;    /* share count for fsdax */
> > >                         };
> > >                         /**
> > >                          * @private: Mapping-private opaque data.
> > >                          * Usually used for buffer_heads if PagePrivate.
> > >                          * Used for swp_entry_t if PageSwapCache.
> > >                          * Indicates order in the buddy system if PageBuddy.
> > >                          */
> > >                         unsigned long private;
> > >                 };
> > >
> > > But within the page pool struct, we have a variable called
> > > dma_addr that is appropriate for storing dma addr.
> > > And that struct is used by netstack. That works to our advantage.
> > >
> > >                 struct {        /* page_pool used by netstack */
> > >                         /**
> > >                          * @pp_magic: magic value to avoid recycling non
> > >                          * page_pool allocated pages.
> > >                          */
> > >                         unsigned long pp_magic;
> > >                         struct page_pool *pp;
> > >                         unsigned long _pp_mapping_pad;
> > >                         unsigned long dma_addr;
> > >                         atomic_long_t pp_ref_count;
> > >                 };
> > >
> > > On the other side, we should use variables from the same sub-struct.
> > > So this patch replaces the "private" with "pp".
> > >
> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > ---
> >
> > Instead of doing a customized version of page pool, can we simply
> > switch to use page pool for big mode instead? Then we don't need to
> > bother the dma stuffs.
>
>
> The page pool needs to do the dma by the DMA APIs.
> So we can not use the page pool directly.

I found this:

define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
                                        * map/unmap

It seems to work here?

Thanks

>
> Thanks.
>
>
> >
> > Thanks
> >
>
Xuan Zhuo April 12, 2024, 6:02 a.m. UTC | #4
On Fri, 12 Apr 2024 13:49:12 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > >
> > > > Now, we chain the pages of big mode by the page's private variable.
> > > > But a subsequent patch aims to make the big mode to support
> > > > premapped mode. This requires additional space to store the dma addr.
> > > >
> > > > Within the sub-struct that contains the 'private', there is no suitable
> > > > variable for storing the DMA addr.
> > > >
> > > >                 struct {        /* Page cache and anonymous pages */
> > > >                         /**
> > > >                          * @lru: Pageout list, eg. active_list protected by
> > > >                          * lruvec->lru_lock.  Sometimes used as a generic list
> > > >                          * by the page owner.
> > > >                          */
> > > >                         union {
> > > >                                 struct list_head lru;
> > > >
> > > >                                 /* Or, for the Unevictable "LRU list" slot */
> > > >                                 struct {
> > > >                                         /* Always even, to negate PageTail */
> > > >                                         void *__filler;
> > > >                                         /* Count page's or folio's mlocks */
> > > >                                         unsigned int mlock_count;
> > > >                                 };
> > > >
> > > >                                 /* Or, free page */
> > > >                                 struct list_head buddy_list;
> > > >                                 struct list_head pcp_list;
> > > >                         };
> > > >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> > > >                         struct address_space *mapping;
> > > >                         union {
> > > >                                 pgoff_t index;          /* Our offset within mapping. */
> > > >                                 unsigned long share;    /* share count for fsdax */
> > > >                         };
> > > >                         /**
> > > >                          * @private: Mapping-private opaque data.
> > > >                          * Usually used for buffer_heads if PagePrivate.
> > > >                          * Used for swp_entry_t if PageSwapCache.
> > > >                          * Indicates order in the buddy system if PageBuddy.
> > > >                          */
> > > >                         unsigned long private;
> > > >                 };
> > > >
> > > > But within the page pool struct, we have a variable called
> > > > dma_addr that is appropriate for storing dma addr.
> > > > And that struct is used by netstack. That works to our advantage.
> > > >
> > > >                 struct {        /* page_pool used by netstack */
> > > >                         /**
> > > >                          * @pp_magic: magic value to avoid recycling non
> > > >                          * page_pool allocated pages.
> > > >                          */
> > > >                         unsigned long pp_magic;
> > > >                         struct page_pool *pp;
> > > >                         unsigned long _pp_mapping_pad;
> > > >                         unsigned long dma_addr;
> > > >                         atomic_long_t pp_ref_count;
> > > >                 };
> > > >
> > > > On the other side, we should use variables from the same sub-struct.
> > > > So this patch replaces the "private" with "pp".
> > > >
> > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > ---
> > >
> > > Instead of doing a customized version of page pool, can we simply
> > > switch to use page pool for big mode instead? Then we don't need to
> > > bother the dma stuffs.
> >
> >
> > The page pool needs to do the dma by the DMA APIs.
> > So we can not use the page pool directly.
>
> I found this:
>
> define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
>                                         * map/unmap

You are right. I missed this. I will try.

Thanks.


>
> It seems to work here?
>
> Thanks
>
> >
> > Thanks.
> >
> >
> > >
> > > Thanks
> > >
> >
>
Xuan Zhuo April 15, 2024, 2:08 a.m. UTC | #5
On Fri, 12 Apr 2024 13:49:12 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > >
> > > > Now, we chain the pages of big mode by the page's private variable.
> > > > But a subsequent patch aims to make the big mode to support
> > > > premapped mode. This requires additional space to store the dma addr.
> > > >
> > > > Within the sub-struct that contains the 'private', there is no suitable
> > > > variable for storing the DMA addr.
> > > >
> > > >                 struct {        /* Page cache and anonymous pages */
> > > >                         /**
> > > >                          * @lru: Pageout list, eg. active_list protected by
> > > >                          * lruvec->lru_lock.  Sometimes used as a generic list
> > > >                          * by the page owner.
> > > >                          */
> > > >                         union {
> > > >                                 struct list_head lru;
> > > >
> > > >                                 /* Or, for the Unevictable "LRU list" slot */
> > > >                                 struct {
> > > >                                         /* Always even, to negate PageTail */
> > > >                                         void *__filler;
> > > >                                         /* Count page's or folio's mlocks */
> > > >                                         unsigned int mlock_count;
> > > >                                 };
> > > >
> > > >                                 /* Or, free page */
> > > >                                 struct list_head buddy_list;
> > > >                                 struct list_head pcp_list;
> > > >                         };
> > > >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> > > >                         struct address_space *mapping;
> > > >                         union {
> > > >                                 pgoff_t index;          /* Our offset within mapping. */
> > > >                                 unsigned long share;    /* share count for fsdax */
> > > >                         };
> > > >                         /**
> > > >                          * @private: Mapping-private opaque data.
> > > >                          * Usually used for buffer_heads if PagePrivate.
> > > >                          * Used for swp_entry_t if PageSwapCache.
> > > >                          * Indicates order in the buddy system if PageBuddy.
> > > >                          */
> > > >                         unsigned long private;
> > > >                 };
> > > >
> > > > But within the page pool struct, we have a variable called
> > > > dma_addr that is appropriate for storing dma addr.
> > > > And that struct is used by netstack. That works to our advantage.
> > > >
> > > >                 struct {        /* page_pool used by netstack */
> > > >                         /**
> > > >                          * @pp_magic: magic value to avoid recycling non
> > > >                          * page_pool allocated pages.
> > > >                          */
> > > >                         unsigned long pp_magic;
> > > >                         struct page_pool *pp;
> > > >                         unsigned long _pp_mapping_pad;
> > > >                         unsigned long dma_addr;
> > > >                         atomic_long_t pp_ref_count;
> > > >                 };
> > > >
> > > > On the other side, we should use variables from the same sub-struct.
> > > > So this patch replaces the "private" with "pp".
> > > >
> > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > ---
> > >
> > > Instead of doing a customized version of page pool, can we simply
> > > switch to use page pool for big mode instead? Then we don't need to
> > > bother the dma stuffs.
> >
> >
> > The page pool needs to do the dma by the DMA APIs.
> > So we can not use the page pool directly.
>
> I found this:
>
> define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
>                                         * map/unmap
>
> It seems to work here?


I have studied the page pool mechanism and believe that we cannot use it
directly. We can make the page pool to bypass the DMA operations.
This allows us to handle DMA within virtio-net for pages allocated from the page
pool. Furthermore, we can utilize page pool helpers to associate the DMA address
to the page.

However, the critical issue pertains to unmapping. Ideally, we want to return
the mapped pages to the page pool and reuse them. In doing so, we can omit the
unmapping and remapping steps.

Currently, there's a caveat: when the page pool cache is full, it disconnects
and releases the pages. When the pool hits its capacity, pages are relinquished
without a chance for unmapping. If we were to unmap pages each time before
returning them to the pool, we would negate the benefits of bypassing the
mapping and unmapping process altogether.

Thanks.



>
> Thanks
>
> >
> > Thanks.
> >
> >
> > >
> > > Thanks
> > >
> >
>
Jason Wang April 15, 2024, 6:43 a.m. UTC | #6
On Mon, Apr 15, 2024 at 10:35 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Fri, 12 Apr 2024 13:49:12 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > >
> > > > > Now, we chain the pages of big mode by the page's private variable.
> > > > > But a subsequent patch aims to make the big mode to support
> > > > > premapped mode. This requires additional space to store the dma addr.
> > > > >
> > > > > Within the sub-struct that contains the 'private', there is no suitable
> > > > > variable for storing the DMA addr.
> > > > >
> > > > >                 struct {        /* Page cache and anonymous pages */
> > > > >                         /**
> > > > >                          * @lru: Pageout list, eg. active_list protected by
> > > > >                          * lruvec->lru_lock.  Sometimes used as a generic list
> > > > >                          * by the page owner.
> > > > >                          */
> > > > >                         union {
> > > > >                                 struct list_head lru;
> > > > >
> > > > >                                 /* Or, for the Unevictable "LRU list" slot */
> > > > >                                 struct {
> > > > >                                         /* Always even, to negate PageTail */
> > > > >                                         void *__filler;
> > > > >                                         /* Count page's or folio's mlocks */
> > > > >                                         unsigned int mlock_count;
> > > > >                                 };
> > > > >
> > > > >                                 /* Or, free page */
> > > > >                                 struct list_head buddy_list;
> > > > >                                 struct list_head pcp_list;
> > > > >                         };
> > > > >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> > > > >                         struct address_space *mapping;
> > > > >                         union {
> > > > >                                 pgoff_t index;          /* Our offset within mapping. */
> > > > >                                 unsigned long share;    /* share count for fsdax */
> > > > >                         };
> > > > >                         /**
> > > > >                          * @private: Mapping-private opaque data.
> > > > >                          * Usually used for buffer_heads if PagePrivate.
> > > > >                          * Used for swp_entry_t if PageSwapCache.
> > > > >                          * Indicates order in the buddy system if PageBuddy.
> > > > >                          */
> > > > >                         unsigned long private;
> > > > >                 };
> > > > >
> > > > > But within the page pool struct, we have a variable called
> > > > > dma_addr that is appropriate for storing dma addr.
> > > > > And that struct is used by netstack. That works to our advantage.
> > > > >
> > > > >                 struct {        /* page_pool used by netstack */
> > > > >                         /**
> > > > >                          * @pp_magic: magic value to avoid recycling non
> > > > >                          * page_pool allocated pages.
> > > > >                          */
> > > > >                         unsigned long pp_magic;
> > > > >                         struct page_pool *pp;
> > > > >                         unsigned long _pp_mapping_pad;
> > > > >                         unsigned long dma_addr;
> > > > >                         atomic_long_t pp_ref_count;
> > > > >                 };
> > > > >
> > > > > On the other side, we should use variables from the same sub-struct.
> > > > > So this patch replaces the "private" with "pp".
> > > > >
> > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > ---
> > > >
> > > > Instead of doing a customized version of page pool, can we simply
> > > > switch to use page pool for big mode instead? Then we don't need to
> > > > bother the dma stuffs.
> > >
> > >
> > > The page pool needs to do the dma by the DMA APIs.
> > > So we can not use the page pool directly.
> >
> > I found this:
> >
> > define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
> >                                         * map/unmap
> >
> > It seems to work here?
>
>
> I have studied the page pool mechanism and believe that we cannot use it
> directly. We can make the page pool to bypass the DMA operations.
> This allows us to handle DMA within virtio-net for pages allocated from the page
> pool. Furthermore, we can utilize page pool helpers to associate the DMA address
> to the page.
>
> However, the critical issue pertains to unmapping. Ideally, we want to return
> the mapped pages to the page pool and reuse them. In doing so, we can omit the
> unmapping and remapping steps.
>
> Currently, there's a caveat: when the page pool cache is full, it disconnects
> and releases the pages. When the pool hits its capacity, pages are relinquished
> without a chance for unmapping.

Technically, when ptr_ring is full there could be a fallback, but then
it requires expensive synchronization between producer and consumer.
For virtio-net, it might not be a problem because add/get has been
synchronized. (It might be relaxed in the future, actually we've
already seen a requirement in the past for virito-blk).

> If we were to unmap pages each time before
> returning them to the pool, we would negate the benefits of bypassing the
> mapping and unmapping process altogether.

Yes, but the problem in this approach is that it creates a corner
exception where dma_addr is used outside the page pool.

Maybe for big mode it doesn't matter too much if there's no
performance improvement.

Thanks

>
> Thanks.
>
>
>
> >
> > Thanks
> >
> > >
> > > Thanks.
> > >
> > >
> > > >
> > > > Thanks
> > > >
> > >
> >
>
Xuan Zhuo April 15, 2024, 8:36 a.m. UTC | #7
On Mon, 15 Apr 2024 14:43:24 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Mon, Apr 15, 2024 at 10:35 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > On Fri, 12 Apr 2024 13:49:12 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > >
> > > > On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > >
> > > > > > Now, we chain the pages of big mode by the page's private variable.
> > > > > > But a subsequent patch aims to make the big mode to support
> > > > > > premapped mode. This requires additional space to store the dma addr.
> > > > > >
> > > > > > Within the sub-struct that contains the 'private', there is no suitable
> > > > > > variable for storing the DMA addr.
> > > > > >
> > > > > >                 struct {        /* Page cache and anonymous pages */
> > > > > >                         /**
> > > > > >                          * @lru: Pageout list, eg. active_list protected by
> > > > > >                          * lruvec->lru_lock.  Sometimes used as a generic list
> > > > > >                          * by the page owner.
> > > > > >                          */
> > > > > >                         union {
> > > > > >                                 struct list_head lru;
> > > > > >
> > > > > >                                 /* Or, for the Unevictable "LRU list" slot */
> > > > > >                                 struct {
> > > > > >                                         /* Always even, to negate PageTail */
> > > > > >                                         void *__filler;
> > > > > >                                         /* Count page's or folio's mlocks */
> > > > > >                                         unsigned int mlock_count;
> > > > > >                                 };
> > > > > >
> > > > > >                                 /* Or, free page */
> > > > > >                                 struct list_head buddy_list;
> > > > > >                                 struct list_head pcp_list;
> > > > > >                         };
> > > > > >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> > > > > >                         struct address_space *mapping;
> > > > > >                         union {
> > > > > >                                 pgoff_t index;          /* Our offset within mapping. */
> > > > > >                                 unsigned long share;    /* share count for fsdax */
> > > > > >                         };
> > > > > >                         /**
> > > > > >                          * @private: Mapping-private opaque data.
> > > > > >                          * Usually used for buffer_heads if PagePrivate.
> > > > > >                          * Used for swp_entry_t if PageSwapCache.
> > > > > >                          * Indicates order in the buddy system if PageBuddy.
> > > > > >                          */
> > > > > >                         unsigned long private;
> > > > > >                 };
> > > > > >
> > > > > > But within the page pool struct, we have a variable called
> > > > > > dma_addr that is appropriate for storing dma addr.
> > > > > > And that struct is used by netstack. That works to our advantage.
> > > > > >
> > > > > >                 struct {        /* page_pool used by netstack */
> > > > > >                         /**
> > > > > >                          * @pp_magic: magic value to avoid recycling non
> > > > > >                          * page_pool allocated pages.
> > > > > >                          */
> > > > > >                         unsigned long pp_magic;
> > > > > >                         struct page_pool *pp;
> > > > > >                         unsigned long _pp_mapping_pad;
> > > > > >                         unsigned long dma_addr;
> > > > > >                         atomic_long_t pp_ref_count;
> > > > > >                 };
> > > > > >
> > > > > > On the other side, we should use variables from the same sub-struct.
> > > > > > So this patch replaces the "private" with "pp".
> > > > > >
> > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > ---
> > > > >
> > > > > Instead of doing a customized version of page pool, can we simply
> > > > > switch to use page pool for big mode instead? Then we don't need to
> > > > > bother the dma stuffs.
> > > >
> > > >
> > > > The page pool needs to do the dma by the DMA APIs.
> > > > So we can not use the page pool directly.
> > >
> > > I found this:
> > >
> > > define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
> > >                                         * map/unmap
> > >
> > > It seems to work here?
> >
> >
> > I have studied the page pool mechanism and believe that we cannot use it
> > directly. We can make the page pool to bypass the DMA operations.
> > This allows us to handle DMA within virtio-net for pages allocated from the page
> > pool. Furthermore, we can utilize page pool helpers to associate the DMA address
> > to the page.
> >
> > However, the critical issue pertains to unmapping. Ideally, we want to return
> > the mapped pages to the page pool and reuse them. In doing so, we can omit the
> > unmapping and remapping steps.
> >
> > Currently, there's a caveat: when the page pool cache is full, it disconnects
> > and releases the pages. When the pool hits its capacity, pages are relinquished
> > without a chance for unmapping.
>
> Technically, when ptr_ring is full there could be a fallback, but then
> it requires expensive synchronization between producer and consumer.
> For virtio-net, it might not be a problem because add/get has been
> synchronized. (It might be relaxed in the future, actually we've
> already seen a requirement in the past for virito-blk).

The point is that the page will be released by page pool directly,
we will have no change to unmap that, if we work with page pool.

>
> > If we were to unmap pages each time before
> > returning them to the pool, we would negate the benefits of bypassing the
> > mapping and unmapping process altogether.
>
> Yes, but the problem in this approach is that it creates a corner
> exception where dma_addr is used outside the page pool.

YES. This is a corner exception. We need to introduce this case to the page
pool.

So for introducing the page-pool to virtio-net(not only for big mode),
we may need to push the page-pool to support dma by drivers.

Back to this patch set, I think we should keep the virtio-net to manage
the pages.

What do you think?

Thanks

>
> Maybe for big mode it doesn't matter too much if there's no
> performance improvement.
>
> Thanks
>
> >
> > Thanks.
> >
> >
> >
> > >
> > > Thanks
> > >
> > > >
> > > > Thanks.
> > > >
> > > >
> > > > >
> > > > > Thanks
> > > > >
> > > >
> > >
> >
>
Jason Wang April 15, 2024, 8:56 a.m. UTC | #8
On Mon, Apr 15, 2024 at 4:50 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Mon, 15 Apr 2024 14:43:24 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Mon, Apr 15, 2024 at 10:35 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > On Fri, 12 Apr 2024 13:49:12 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > >
> > > > > On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > >
> > > > > > > Now, we chain the pages of big mode by the page's private variable.
> > > > > > > But a subsequent patch aims to make the big mode to support
> > > > > > > premapped mode. This requires additional space to store the dma addr.
> > > > > > >
> > > > > > > Within the sub-struct that contains the 'private', there is no suitable
> > > > > > > variable for storing the DMA addr.
> > > > > > >
> > > > > > >                 struct {        /* Page cache and anonymous pages */
> > > > > > >                         /**
> > > > > > >                          * @lru: Pageout list, eg. active_list protected by
> > > > > > >                          * lruvec->lru_lock.  Sometimes used as a generic list
> > > > > > >                          * by the page owner.
> > > > > > >                          */
> > > > > > >                         union {
> > > > > > >                                 struct list_head lru;
> > > > > > >
> > > > > > >                                 /* Or, for the Unevictable "LRU list" slot */
> > > > > > >                                 struct {
> > > > > > >                                         /* Always even, to negate PageTail */
> > > > > > >                                         void *__filler;
> > > > > > >                                         /* Count page's or folio's mlocks */
> > > > > > >                                         unsigned int mlock_count;
> > > > > > >                                 };
> > > > > > >
> > > > > > >                                 /* Or, free page */
> > > > > > >                                 struct list_head buddy_list;
> > > > > > >                                 struct list_head pcp_list;
> > > > > > >                         };
> > > > > > >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> > > > > > >                         struct address_space *mapping;
> > > > > > >                         union {
> > > > > > >                                 pgoff_t index;          /* Our offset within mapping. */
> > > > > > >                                 unsigned long share;    /* share count for fsdax */
> > > > > > >                         };
> > > > > > >                         /**
> > > > > > >                          * @private: Mapping-private opaque data.
> > > > > > >                          * Usually used for buffer_heads if PagePrivate.
> > > > > > >                          * Used for swp_entry_t if PageSwapCache.
> > > > > > >                          * Indicates order in the buddy system if PageBuddy.
> > > > > > >                          */
> > > > > > >                         unsigned long private;
> > > > > > >                 };
> > > > > > >
> > > > > > > But within the page pool struct, we have a variable called
> > > > > > > dma_addr that is appropriate for storing dma addr.
> > > > > > > And that struct is used by netstack. That works to our advantage.
> > > > > > >
> > > > > > >                 struct {        /* page_pool used by netstack */
> > > > > > >                         /**
> > > > > > >                          * @pp_magic: magic value to avoid recycling non
> > > > > > >                          * page_pool allocated pages.
> > > > > > >                          */
> > > > > > >                         unsigned long pp_magic;
> > > > > > >                         struct page_pool *pp;
> > > > > > >                         unsigned long _pp_mapping_pad;
> > > > > > >                         unsigned long dma_addr;
> > > > > > >                         atomic_long_t pp_ref_count;
> > > > > > >                 };
> > > > > > >
> > > > > > > On the other side, we should use variables from the same sub-struct.
> > > > > > > So this patch replaces the "private" with "pp".
> > > > > > >
> > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > ---
> > > > > >
> > > > > > Instead of doing a customized version of page pool, can we simply
> > > > > > switch to use page pool for big mode instead? Then we don't need to
> > > > > > bother the dma stuffs.
> > > > >
> > > > >
> > > > > The page pool needs to do the dma by the DMA APIs.
> > > > > So we can not use the page pool directly.
> > > >
> > > > I found this:
> > > >
> > > > define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
> > > >                                         * map/unmap
> > > >
> > > > It seems to work here?
> > >
> > >
> > > I have studied the page pool mechanism and believe that we cannot use it
> > > directly. We can make the page pool to bypass the DMA operations.
> > > This allows us to handle DMA within virtio-net for pages allocated from the page
> > > pool. Furthermore, we can utilize page pool helpers to associate the DMA address
> > > to the page.
> > >
> > > However, the critical issue pertains to unmapping. Ideally, we want to return
> > > the mapped pages to the page pool and reuse them. In doing so, we can omit the
> > > unmapping and remapping steps.
> > >
> > > Currently, there's a caveat: when the page pool cache is full, it disconnects
> > > and releases the pages. When the pool hits its capacity, pages are relinquished
> > > without a chance for unmapping.
> >
> > Technically, when ptr_ring is full there could be a fallback, but then
> > it requires expensive synchronization between producer and consumer.
> > For virtio-net, it might not be a problem because add/get has been
> > synchronized. (It might be relaxed in the future, actually we've
> > already seen a requirement in the past for virito-blk).
>
> The point is that the page will be released by page pool directly,
> we will have no change to unmap that, if we work with page pool.

I mean if we have a fallback, there would be no need to release these
pages but put them into a link list.

>
> >
> > > If we were to unmap pages each time before
> > > returning them to the pool, we would negate the benefits of bypassing the
> > > mapping and unmapping process altogether.
> >
> > Yes, but the problem in this approach is that it creates a corner
> > exception where dma_addr is used outside the page pool.
>
> YES. This is a corner exception. We need to introduce this case to the page
> pool.
>
> So for introducing the page-pool to virtio-net(not only for big mode),
> we may need to push the page-pool to support dma by drivers.

Adding Jesper for some comments.

>
> Back to this patch set, I think we should keep the virtio-net to manage
> the pages.
>
> What do you think?

I might be wrong, but I think if we need to either

1) seek a way to manage the pages by yourself but not touching page
pool metadata (or Jesper is fine with this)
2) optimize the unmap for page pool

or even

3) just do dma_unmap before returning the page back to the page pool,
we don't get all the benefits of page pool but we end up with simple
codes (no fallback for premapping).

Thanks


>
> Thanks
>
> >
> > Maybe for big mode it doesn't matter too much if there's no
> > performance improvement.
> >
> > Thanks
> >
> > >
> > > Thanks.
> > >
> > >
> > >
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > Thanks.
> > > > >
> > > > >
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > >
> > > >
> > >
> >
>
Xuan Zhuo April 15, 2024, 8:59 a.m. UTC | #9
On Mon, 15 Apr 2024 16:56:45 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Mon, Apr 15, 2024 at 4:50 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > On Mon, 15 Apr 2024 14:43:24 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > On Mon, Apr 15, 2024 at 10:35 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > >
> > > > On Fri, 12 Apr 2024 13:49:12 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > >
> > > > > > On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > >
> > > > > > > > Now, we chain the pages of big mode by the page's private variable.
> > > > > > > > But a subsequent patch aims to make the big mode to support
> > > > > > > > premapped mode. This requires additional space to store the dma addr.
> > > > > > > >
> > > > > > > > Within the sub-struct that contains the 'private', there is no suitable
> > > > > > > > variable for storing the DMA addr.
> > > > > > > >
> > > > > > > >                 struct {        /* Page cache and anonymous pages */
> > > > > > > >                         /**
> > > > > > > >                          * @lru: Pageout list, eg. active_list protected by
> > > > > > > >                          * lruvec->lru_lock.  Sometimes used as a generic list
> > > > > > > >                          * by the page owner.
> > > > > > > >                          */
> > > > > > > >                         union {
> > > > > > > >                                 struct list_head lru;
> > > > > > > >
> > > > > > > >                                 /* Or, for the Unevictable "LRU list" slot */
> > > > > > > >                                 struct {
> > > > > > > >                                         /* Always even, to negate PageTail */
> > > > > > > >                                         void *__filler;
> > > > > > > >                                         /* Count page's or folio's mlocks */
> > > > > > > >                                         unsigned int mlock_count;
> > > > > > > >                                 };
> > > > > > > >
> > > > > > > >                                 /* Or, free page */
> > > > > > > >                                 struct list_head buddy_list;
> > > > > > > >                                 struct list_head pcp_list;
> > > > > > > >                         };
> > > > > > > >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> > > > > > > >                         struct address_space *mapping;
> > > > > > > >                         union {
> > > > > > > >                                 pgoff_t index;          /* Our offset within mapping. */
> > > > > > > >                                 unsigned long share;    /* share count for fsdax */
> > > > > > > >                         };
> > > > > > > >                         /**
> > > > > > > >                          * @private: Mapping-private opaque data.
> > > > > > > >                          * Usually used for buffer_heads if PagePrivate.
> > > > > > > >                          * Used for swp_entry_t if PageSwapCache.
> > > > > > > >                          * Indicates order in the buddy system if PageBuddy.
> > > > > > > >                          */
> > > > > > > >                         unsigned long private;
> > > > > > > >                 };
> > > > > > > >
> > > > > > > > But within the page pool struct, we have a variable called
> > > > > > > > dma_addr that is appropriate for storing dma addr.
> > > > > > > > And that struct is used by netstack. That works to our advantage.
> > > > > > > >
> > > > > > > >                 struct {        /* page_pool used by netstack */
> > > > > > > >                         /**
> > > > > > > >                          * @pp_magic: magic value to avoid recycling non
> > > > > > > >                          * page_pool allocated pages.
> > > > > > > >                          */
> > > > > > > >                         unsigned long pp_magic;
> > > > > > > >                         struct page_pool *pp;
> > > > > > > >                         unsigned long _pp_mapping_pad;
> > > > > > > >                         unsigned long dma_addr;
> > > > > > > >                         atomic_long_t pp_ref_count;
> > > > > > > >                 };
> > > > > > > >
> > > > > > > > On the other side, we should use variables from the same sub-struct.
> > > > > > > > So this patch replaces the "private" with "pp".
> > > > > > > >
> > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > ---
> > > > > > >
> > > > > > > Instead of doing a customized version of page pool, can we simply
> > > > > > > switch to use page pool for big mode instead? Then we don't need to
> > > > > > > bother the dma stuffs.
> > > > > >
> > > > > >
> > > > > > The page pool needs to do the dma by the DMA APIs.
> > > > > > So we can not use the page pool directly.
> > > > >
> > > > > I found this:
> > > > >
> > > > > define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
> > > > >                                         * map/unmap
> > > > >
> > > > > It seems to work here?
> > > >
> > > >
> > > > I have studied the page pool mechanism and believe that we cannot use it
> > > > directly. We can make the page pool to bypass the DMA operations.
> > > > This allows us to handle DMA within virtio-net for pages allocated from the page
> > > > pool. Furthermore, we can utilize page pool helpers to associate the DMA address
> > > > to the page.
> > > >
> > > > However, the critical issue pertains to unmapping. Ideally, we want to return
> > > > the mapped pages to the page pool and reuse them. In doing so, we can omit the
> > > > unmapping and remapping steps.
> > > >
> > > > Currently, there's a caveat: when the page pool cache is full, it disconnects
> > > > and releases the pages. When the pool hits its capacity, pages are relinquished
> > > > without a chance for unmapping.
> > >
> > > Technically, when ptr_ring is full there could be a fallback, but then
> > > it requires expensive synchronization between producer and consumer.
> > > For virtio-net, it might not be a problem because add/get has been
> > > synchronized. (It might be relaxed in the future, actually we've
> > > already seen a requirement in the past for virito-blk).
> >
> > The point is that the page will be released by page pool directly,
> > we will have no change to unmap that, if we work with page pool.
>
> I mean if we have a fallback, there would be no need to release these
> pages but put them into a link list.


What fallback?

If we put the pages to the link list, why we use the page pool?


>
> >
> > >
> > > > If we were to unmap pages each time before
> > > > returning them to the pool, we would negate the benefits of bypassing the
> > > > mapping and unmapping process altogether.
> > >
> > > Yes, but the problem in this approach is that it creates a corner
> > > exception where dma_addr is used outside the page pool.
> >
> > YES. This is a corner exception. We need to introduce this case to the page
> > pool.
> >
> > So for introducing the page-pool to virtio-net(not only for big mode),
> > we may need to push the page-pool to support dma by drivers.
>
> Adding Jesper for some comments.
>
> >
> > Back to this patch set, I think we should keep the virtio-net to manage
> > the pages.
> >
> > What do you think?
>
> I might be wrong, but I think if we need to either
>
> 1) seek a way to manage the pages by yourself but not touching page
> pool metadata (or Jesper is fine with this)

Do you mean working with page pool or not?

If we manage the pages by self(no page pool), we do not care the metadata is for
page pool or not. We just use the space of pages like the "private".


> 2) optimize the unmap for page pool
>
> or even
>
> 3) just do dma_unmap before returning the page back to the page pool,
> we don't get all the benefits of page pool but we end up with simple
> codes (no fallback for premapping).

I am ok for this.


Thanks.

>
> Thanks
>
>
> >
> > Thanks
> >
> > >
> > > Maybe for big mode it doesn't matter too much if there's no
> > > performance improvement.
> > >
> > > Thanks
> > >
> > > >
> > > > Thanks.
> > > >
> > > >
> > > >
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > Thanks.
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > >
> > > > >
> > > >
> > >
> >
>
Jason Wang April 16, 2024, 3:24 a.m. UTC | #10
On Mon, Apr 15, 2024 at 5:04 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Mon, 15 Apr 2024 16:56:45 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Mon, Apr 15, 2024 at 4:50 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > On Mon, 15 Apr 2024 14:43:24 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > On Mon, Apr 15, 2024 at 10:35 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > >
> > > > > On Fri, 12 Apr 2024 13:49:12 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > >
> > > > > > > On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > >
> > > > > > > > > Now, we chain the pages of big mode by the page's private variable.
> > > > > > > > > But a subsequent patch aims to make the big mode to support
> > > > > > > > > premapped mode. This requires additional space to store the dma addr.
> > > > > > > > >
> > > > > > > > > Within the sub-struct that contains the 'private', there is no suitable
> > > > > > > > > variable for storing the DMA addr.
> > > > > > > > >
> > > > > > > > >                 struct {        /* Page cache and anonymous pages */
> > > > > > > > >                         /**
> > > > > > > > >                          * @lru: Pageout list, eg. active_list protected by
> > > > > > > > >                          * lruvec->lru_lock.  Sometimes used as a generic list
> > > > > > > > >                          * by the page owner.
> > > > > > > > >                          */
> > > > > > > > >                         union {
> > > > > > > > >                                 struct list_head lru;
> > > > > > > > >
> > > > > > > > >                                 /* Or, for the Unevictable "LRU list" slot */
> > > > > > > > >                                 struct {
> > > > > > > > >                                         /* Always even, to negate PageTail */
> > > > > > > > >                                         void *__filler;
> > > > > > > > >                                         /* Count page's or folio's mlocks */
> > > > > > > > >                                         unsigned int mlock_count;
> > > > > > > > >                                 };
> > > > > > > > >
> > > > > > > > >                                 /* Or, free page */
> > > > > > > > >                                 struct list_head buddy_list;
> > > > > > > > >                                 struct list_head pcp_list;
> > > > > > > > >                         };
> > > > > > > > >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> > > > > > > > >                         struct address_space *mapping;
> > > > > > > > >                         union {
> > > > > > > > >                                 pgoff_t index;          /* Our offset within mapping. */
> > > > > > > > >                                 unsigned long share;    /* share count for fsdax */
> > > > > > > > >                         };
> > > > > > > > >                         /**
> > > > > > > > >                          * @private: Mapping-private opaque data.
> > > > > > > > >                          * Usually used for buffer_heads if PagePrivate.
> > > > > > > > >                          * Used for swp_entry_t if PageSwapCache.
> > > > > > > > >                          * Indicates order in the buddy system if PageBuddy.
> > > > > > > > >                          */
> > > > > > > > >                         unsigned long private;
> > > > > > > > >                 };
> > > > > > > > >
> > > > > > > > > But within the page pool struct, we have a variable called
> > > > > > > > > dma_addr that is appropriate for storing dma addr.
> > > > > > > > > And that struct is used by netstack. That works to our advantage.
> > > > > > > > >
> > > > > > > > >                 struct {        /* page_pool used by netstack */
> > > > > > > > >                         /**
> > > > > > > > >                          * @pp_magic: magic value to avoid recycling non
> > > > > > > > >                          * page_pool allocated pages.
> > > > > > > > >                          */
> > > > > > > > >                         unsigned long pp_magic;
> > > > > > > > >                         struct page_pool *pp;
> > > > > > > > >                         unsigned long _pp_mapping_pad;
> > > > > > > > >                         unsigned long dma_addr;
> > > > > > > > >                         atomic_long_t pp_ref_count;
> > > > > > > > >                 };
> > > > > > > > >
> > > > > > > > > On the other side, we should use variables from the same sub-struct.
> > > > > > > > > So this patch replaces the "private" with "pp".
> > > > > > > > >
> > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > > ---
> > > > > > > >
> > > > > > > > Instead of doing a customized version of page pool, can we simply
> > > > > > > > switch to use page pool for big mode instead? Then we don't need to
> > > > > > > > bother the dma stuffs.
> > > > > > >
> > > > > > >
> > > > > > > The page pool needs to do the dma by the DMA APIs.
> > > > > > > So we can not use the page pool directly.
> > > > > >
> > > > > > I found this:
> > > > > >
> > > > > > define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
> > > > > >                                         * map/unmap
> > > > > >
> > > > > > It seems to work here?
> > > > >
> > > > >
> > > > > I have studied the page pool mechanism and believe that we cannot use it
> > > > > directly. We can make the page pool to bypass the DMA operations.
> > > > > This allows us to handle DMA within virtio-net for pages allocated from the page
> > > > > pool. Furthermore, we can utilize page pool helpers to associate the DMA address
> > > > > to the page.
> > > > >
> > > > > However, the critical issue pertains to unmapping. Ideally, we want to return
> > > > > the mapped pages to the page pool and reuse them. In doing so, we can omit the
> > > > > unmapping and remapping steps.
> > > > >
> > > > > Currently, there's a caveat: when the page pool cache is full, it disconnects
> > > > > and releases the pages. When the pool hits its capacity, pages are relinquished
> > > > > without a chance for unmapping.
> > > >
> > > > Technically, when ptr_ring is full there could be a fallback, but then
> > > > it requires expensive synchronization between producer and consumer.
> > > > For virtio-net, it might not be a problem because add/get has been
> > > > synchronized. (It might be relaxed in the future, actually we've
> > > > already seen a requirement in the past for virito-blk).
> > >
> > > The point is that the page will be released by page pool directly,
> > > we will have no change to unmap that, if we work with page pool.
> >
> > I mean if we have a fallback, there would be no need to release these
> > pages but put them into a link list.
>
>
> What fallback?

https://lore.kernel.org/netdev/1519607771-20613-1-git-send-email-mst@redhat.com/

>
> If we put the pages to the link list, why we use the page pool?

The size of the cache and ptr_ring needs to be fixed.

Again, as explained above, it needs more benchmarks and looks like a
separate topic.

>
>
> >
> > >
> > > >
> > > > > If we were to unmap pages each time before
> > > > > returning them to the pool, we would negate the benefits of bypassing the
> > > > > mapping and unmapping process altogether.
> > > >
> > > > Yes, but the problem in this approach is that it creates a corner
> > > > exception where dma_addr is used outside the page pool.
> > >
> > > YES. This is a corner exception. We need to introduce this case to the page
> > > pool.
> > >
> > > So for introducing the page-pool to virtio-net(not only for big mode),
> > > we may need to push the page-pool to support dma by drivers.
> >
> > Adding Jesper for some comments.
> >
> > >
> > > Back to this patch set, I think we should keep the virtio-net to manage
> > > the pages.
> > >
> > > What do you think?
> >
> > I might be wrong, but I think if we need to either
> >
> > 1) seek a way to manage the pages by yourself but not touching page
> > pool metadata (or Jesper is fine with this)
>
> Do you mean working with page pool or not?
>

I meant if Jesper is fine with reusing page pool metadata like this patch.

> If we manage the pages by self(no page pool), we do not care the metadata is for
> page pool or not. We just use the space of pages like the "private".

That's also fine.

>
>
> > 2) optimize the unmap for page pool
> >
> > or even
> >
> > 3) just do dma_unmap before returning the page back to the page pool,
> > we don't get all the benefits of page pool but we end up with simple
> > codes (no fallback for premapping).
>
> I am ok for this.

Right, we just need to make sure there's no performance regression,
then it would be fine.

I see for example mana did this as well.

Thanks

>
>
> Thanks.
>
> >
> > Thanks
> >
> >
> > >
> > > Thanks
> > >
> > > >
> > > > Maybe for big mode it doesn't matter too much if there's no
> > > > performance improvement.
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > Thanks.
> > > > >
> > > > >
> > > > >
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > >
> > > > > > > Thanks.
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > > >
> > >
> >
>
Xuan Zhuo April 17, 2024, 1:30 a.m. UTC | #11
On Tue, 16 Apr 2024 11:24:53 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Mon, Apr 15, 2024 at 5:04 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > On Mon, 15 Apr 2024 16:56:45 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > On Mon, Apr 15, 2024 at 4:50 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > >
> > > > On Mon, 15 Apr 2024 14:43:24 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > On Mon, Apr 15, 2024 at 10:35 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > >
> > > > > > On Fri, 12 Apr 2024 13:49:12 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > >
> > > > > > > > On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > >
> > > > > > > > > > Now, we chain the pages of big mode by the page's private variable.
> > > > > > > > > > But a subsequent patch aims to make the big mode to support
> > > > > > > > > > premapped mode. This requires additional space to store the dma addr.
> > > > > > > > > >
> > > > > > > > > > Within the sub-struct that contains the 'private', there is no suitable
> > > > > > > > > > variable for storing the DMA addr.
> > > > > > > > > >
> > > > > > > > > >                 struct {        /* Page cache and anonymous pages */
> > > > > > > > > >                         /**
> > > > > > > > > >                          * @lru: Pageout list, eg. active_list protected by
> > > > > > > > > >                          * lruvec->lru_lock.  Sometimes used as a generic list
> > > > > > > > > >                          * by the page owner.
> > > > > > > > > >                          */
> > > > > > > > > >                         union {
> > > > > > > > > >                                 struct list_head lru;
> > > > > > > > > >
> > > > > > > > > >                                 /* Or, for the Unevictable "LRU list" slot */
> > > > > > > > > >                                 struct {
> > > > > > > > > >                                         /* Always even, to negate PageTail */
> > > > > > > > > >                                         void *__filler;
> > > > > > > > > >                                         /* Count page's or folio's mlocks */
> > > > > > > > > >                                         unsigned int mlock_count;
> > > > > > > > > >                                 };
> > > > > > > > > >
> > > > > > > > > >                                 /* Or, free page */
> > > > > > > > > >                                 struct list_head buddy_list;
> > > > > > > > > >                                 struct list_head pcp_list;
> > > > > > > > > >                         };
> > > > > > > > > >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> > > > > > > > > >                         struct address_space *mapping;
> > > > > > > > > >                         union {
> > > > > > > > > >                                 pgoff_t index;          /* Our offset within mapping. */
> > > > > > > > > >                                 unsigned long share;    /* share count for fsdax */
> > > > > > > > > >                         };
> > > > > > > > > >                         /**
> > > > > > > > > >                          * @private: Mapping-private opaque data.
> > > > > > > > > >                          * Usually used for buffer_heads if PagePrivate.
> > > > > > > > > >                          * Used for swp_entry_t if PageSwapCache.
> > > > > > > > > >                          * Indicates order in the buddy system if PageBuddy.
> > > > > > > > > >                          */
> > > > > > > > > >                         unsigned long private;
> > > > > > > > > >                 };
> > > > > > > > > >
> > > > > > > > > > But within the page pool struct, we have a variable called
> > > > > > > > > > dma_addr that is appropriate for storing dma addr.
> > > > > > > > > > And that struct is used by netstack. That works to our advantage.
> > > > > > > > > >
> > > > > > > > > >                 struct {        /* page_pool used by netstack */
> > > > > > > > > >                         /**
> > > > > > > > > >                          * @pp_magic: magic value to avoid recycling non
> > > > > > > > > >                          * page_pool allocated pages.
> > > > > > > > > >                          */
> > > > > > > > > >                         unsigned long pp_magic;
> > > > > > > > > >                         struct page_pool *pp;
> > > > > > > > > >                         unsigned long _pp_mapping_pad;
> > > > > > > > > >                         unsigned long dma_addr;
> > > > > > > > > >                         atomic_long_t pp_ref_count;
> > > > > > > > > >                 };
> > > > > > > > > >
> > > > > > > > > > On the other side, we should use variables from the same sub-struct.
> > > > > > > > > > So this patch replaces the "private" with "pp".
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > > > ---
> > > > > > > > >
> > > > > > > > > Instead of doing a customized version of page pool, can we simply
> > > > > > > > > switch to use page pool for big mode instead? Then we don't need to
> > > > > > > > > bother the dma stuffs.
> > > > > > > >
> > > > > > > >
> > > > > > > > The page pool needs to do the dma by the DMA APIs.
> > > > > > > > So we can not use the page pool directly.
> > > > > > >
> > > > > > > I found this:
> > > > > > >
> > > > > > > define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
> > > > > > >                                         * map/unmap
> > > > > > >
> > > > > > > It seems to work here?
> > > > > >
> > > > > >
> > > > > > I have studied the page pool mechanism and believe that we cannot use it
> > > > > > directly. We can make the page pool to bypass the DMA operations.
> > > > > > This allows us to handle DMA within virtio-net for pages allocated from the page
> > > > > > pool. Furthermore, we can utilize page pool helpers to associate the DMA address
> > > > > > to the page.
> > > > > >
> > > > > > However, the critical issue pertains to unmapping. Ideally, we want to return
> > > > > > the mapped pages to the page pool and reuse them. In doing so, we can omit the
> > > > > > unmapping and remapping steps.
> > > > > >
> > > > > > Currently, there's a caveat: when the page pool cache is full, it disconnects
> > > > > > and releases the pages. When the pool hits its capacity, pages are relinquished
> > > > > > without a chance for unmapping.
> > > > >
> > > > > Technically, when ptr_ring is full there could be a fallback, but then
> > > > > it requires expensive synchronization between producer and consumer.
> > > > > For virtio-net, it might not be a problem because add/get has been
> > > > > synchronized. (It might be relaxed in the future, actually we've
> > > > > already seen a requirement in the past for virito-blk).
> > > >
> > > > The point is that the page will be released by page pool directly,
> > > > we will have no change to unmap that, if we work with page pool.
> > >
> > > I mean if we have a fallback, there would be no need to release these
> > > pages but put them into a link list.
> >
> >
> > What fallback?
>
> https://lore.kernel.org/netdev/1519607771-20613-1-git-send-email-mst@redhat.com/
>
> >
> > If we put the pages to the link list, why we use the page pool?
>
> The size of the cache and ptr_ring needs to be fixed.
>
> Again, as explained above, it needs more benchmarks and looks like a
> separate topic.
>
> >
> >
> > >
> > > >
> > > > >
> > > > > > If we were to unmap pages each time before
> > > > > > returning them to the pool, we would negate the benefits of bypassing the
> > > > > > mapping and unmapping process altogether.
> > > > >
> > > > > Yes, but the problem in this approach is that it creates a corner
> > > > > exception where dma_addr is used outside the page pool.
> > > >
> > > > YES. This is a corner exception. We need to introduce this case to the page
> > > > pool.
> > > >
> > > > So for introducing the page-pool to virtio-net(not only for big mode),
> > > > we may need to push the page-pool to support dma by drivers.
> > >
> > > Adding Jesper for some comments.
> > >
> > > >
> > > > Back to this patch set, I think we should keep the virtio-net to manage
> > > > the pages.
> > > >
> > > > What do you think?
> > >
> > > I might be wrong, but I think if we need to either
> > >
> > > 1) seek a way to manage the pages by yourself but not touching page
> > > pool metadata (or Jesper is fine with this)
> >
> > Do you mean working with page pool or not?
> >
>
> I meant if Jesper is fine with reusing page pool metadata like this patch.
>
> > If we manage the pages by self(no page pool), we do not care the metadata is for
> > page pool or not. We just use the space of pages like the "private".
>
> That's also fine.
>
> >
> >
> > > 2) optimize the unmap for page pool
> > >
> > > or even
> > >
> > > 3) just do dma_unmap before returning the page back to the page pool,
> > > we don't get all the benefits of page pool but we end up with simple
> > > codes (no fallback for premapping).
> >
> > I am ok for this.
>
> Right, we just need to make sure there's no performance regression,
> then it would be fine.
>
> I see for example mana did this as well.

I think we should not use page pool directly now,
because the mana does not need a space to store the dma address.
We need to store the dma address for unmapping.

If we use page pool without PP_FLAG_DMA_MAP, then store the dma address by
page.dma_addr, I think that is not safe.

I think the way of this patch set is fine. We just use the
space of the page whatever it is page pool or not to store
the link and dma address.

Thanks.

>
> Thanks
>
> >
> >
> > Thanks.
> >
> > >
> > > Thanks
> > >
> > >
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > Maybe for big mode it doesn't matter too much if there's no
> > > > > performance improvement.
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > Thanks.
> > > > > >
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > >
> > > > > > > > Thanks.
> > > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Thanks
> > > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > > >
> > >
> >
>
>
Jason Wang April 17, 2024, 4:08 a.m. UTC | #12
On Wed, Apr 17, 2024 at 9:38 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Tue, 16 Apr 2024 11:24:53 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Mon, Apr 15, 2024 at 5:04 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > On Mon, 15 Apr 2024 16:56:45 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > On Mon, Apr 15, 2024 at 4:50 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > >
> > > > > On Mon, 15 Apr 2024 14:43:24 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > On Mon, Apr 15, 2024 at 10:35 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > >
> > > > > > > On Fri, 12 Apr 2024 13:49:12 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > >
> > > > > > > > > On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > > On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > Now, we chain the pages of big mode by the page's private variable.
> > > > > > > > > > > But a subsequent patch aims to make the big mode to support
> > > > > > > > > > > premapped mode. This requires additional space to store the dma addr.
> > > > > > > > > > >
> > > > > > > > > > > Within the sub-struct that contains the 'private', there is no suitable
> > > > > > > > > > > variable for storing the DMA addr.
> > > > > > > > > > >
> > > > > > > > > > >                 struct {        /* Page cache and anonymous pages */
> > > > > > > > > > >                         /**
> > > > > > > > > > >                          * @lru: Pageout list, eg. active_list protected by
> > > > > > > > > > >                          * lruvec->lru_lock.  Sometimes used as a generic list
> > > > > > > > > > >                          * by the page owner.
> > > > > > > > > > >                          */
> > > > > > > > > > >                         union {
> > > > > > > > > > >                                 struct list_head lru;
> > > > > > > > > > >
> > > > > > > > > > >                                 /* Or, for the Unevictable "LRU list" slot */
> > > > > > > > > > >                                 struct {
> > > > > > > > > > >                                         /* Always even, to negate PageTail */
> > > > > > > > > > >                                         void *__filler;
> > > > > > > > > > >                                         /* Count page's or folio's mlocks */
> > > > > > > > > > >                                         unsigned int mlock_count;
> > > > > > > > > > >                                 };
> > > > > > > > > > >
> > > > > > > > > > >                                 /* Or, free page */
> > > > > > > > > > >                                 struct list_head buddy_list;
> > > > > > > > > > >                                 struct list_head pcp_list;
> > > > > > > > > > >                         };
> > > > > > > > > > >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> > > > > > > > > > >                         struct address_space *mapping;
> > > > > > > > > > >                         union {
> > > > > > > > > > >                                 pgoff_t index;          /* Our offset within mapping. */
> > > > > > > > > > >                                 unsigned long share;    /* share count for fsdax */
> > > > > > > > > > >                         };
> > > > > > > > > > >                         /**
> > > > > > > > > > >                          * @private: Mapping-private opaque data.
> > > > > > > > > > >                          * Usually used for buffer_heads if PagePrivate.
> > > > > > > > > > >                          * Used for swp_entry_t if PageSwapCache.
> > > > > > > > > > >                          * Indicates order in the buddy system if PageBuddy.
> > > > > > > > > > >                          */
> > > > > > > > > > >                         unsigned long private;
> > > > > > > > > > >                 };
> > > > > > > > > > >
> > > > > > > > > > > But within the page pool struct, we have a variable called
> > > > > > > > > > > dma_addr that is appropriate for storing dma addr.
> > > > > > > > > > > And that struct is used by netstack. That works to our advantage.
> > > > > > > > > > >
> > > > > > > > > > >                 struct {        /* page_pool used by netstack */
> > > > > > > > > > >                         /**
> > > > > > > > > > >                          * @pp_magic: magic value to avoid recycling non
> > > > > > > > > > >                          * page_pool allocated pages.
> > > > > > > > > > >                          */
> > > > > > > > > > >                         unsigned long pp_magic;
> > > > > > > > > > >                         struct page_pool *pp;
> > > > > > > > > > >                         unsigned long _pp_mapping_pad;
> > > > > > > > > > >                         unsigned long dma_addr;
> > > > > > > > > > >                         atomic_long_t pp_ref_count;
> > > > > > > > > > >                 };
> > > > > > > > > > >
> > > > > > > > > > > On the other side, we should use variables from the same sub-struct.
> > > > > > > > > > > So this patch replaces the "private" with "pp".
> > > > > > > > > > >
> > > > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > > > > ---
> > > > > > > > > >
> > > > > > > > > > Instead of doing a customized version of page pool, can we simply
> > > > > > > > > > switch to use page pool for big mode instead? Then we don't need to
> > > > > > > > > > bother the dma stuffs.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > The page pool needs to do the dma by the DMA APIs.
> > > > > > > > > So we can not use the page pool directly.
> > > > > > > >
> > > > > > > > I found this:
> > > > > > > >
> > > > > > > > define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
> > > > > > > >                                         * map/unmap
> > > > > > > >
> > > > > > > > It seems to work here?
> > > > > > >
> > > > > > >
> > > > > > > I have studied the page pool mechanism and believe that we cannot use it
> > > > > > > directly. We can make the page pool to bypass the DMA operations.
> > > > > > > This allows us to handle DMA within virtio-net for pages allocated from the page
> > > > > > > pool. Furthermore, we can utilize page pool helpers to associate the DMA address
> > > > > > > to the page.
> > > > > > >
> > > > > > > However, the critical issue pertains to unmapping. Ideally, we want to return
> > > > > > > the mapped pages to the page pool and reuse them. In doing so, we can omit the
> > > > > > > unmapping and remapping steps.
> > > > > > >
> > > > > > > Currently, there's a caveat: when the page pool cache is full, it disconnects
> > > > > > > and releases the pages. When the pool hits its capacity, pages are relinquished
> > > > > > > without a chance for unmapping.
> > > > > >
> > > > > > Technically, when ptr_ring is full there could be a fallback, but then
> > > > > > it requires expensive synchronization between producer and consumer.
> > > > > > For virtio-net, it might not be a problem because add/get has been
> > > > > > synchronized. (It might be relaxed in the future, actually we've
> > > > > > already seen a requirement in the past for virito-blk).
> > > > >
> > > > > The point is that the page will be released by page pool directly,
> > > > > we will have no change to unmap that, if we work with page pool.
> > > >
> > > > I mean if we have a fallback, there would be no need to release these
> > > > pages but put them into a link list.
> > >
> > >
> > > What fallback?
> >
> > https://lore.kernel.org/netdev/1519607771-20613-1-git-send-email-mst@redhat.com/
> >
> > >
> > > If we put the pages to the link list, why we use the page pool?
> >
> > The size of the cache and ptr_ring needs to be fixed.
> >
> > Again, as explained above, it needs more benchmarks and looks like a
> > separate topic.
> >
> > >
> > >
> > > >
> > > > >
> > > > > >
> > > > > > > If we were to unmap pages each time before
> > > > > > > returning them to the pool, we would negate the benefits of bypassing the
> > > > > > > mapping and unmapping process altogether.
> > > > > >
> > > > > > Yes, but the problem in this approach is that it creates a corner
> > > > > > exception where dma_addr is used outside the page pool.
> > > > >
> > > > > YES. This is a corner exception. We need to introduce this case to the page
> > > > > pool.
> > > > >
> > > > > So for introducing the page-pool to virtio-net(not only for big mode),
> > > > > we may need to push the page-pool to support dma by drivers.
> > > >
> > > > Adding Jesper for some comments.
> > > >
> > > > >
> > > > > Back to this patch set, I think we should keep the virtio-net to manage
> > > > > the pages.
> > > > >
> > > > > What do you think?
> > > >
> > > > I might be wrong, but I think if we need to either
> > > >
> > > > 1) seek a way to manage the pages by yourself but not touching page
> > > > pool metadata (or Jesper is fine with this)
> > >
> > > Do you mean working with page pool or not?
> > >
> >
> > I meant if Jesper is fine with reusing page pool metadata like this patch.
> >
> > > If we manage the pages by self(no page pool), we do not care the metadata is for
> > > page pool or not. We just use the space of pages like the "private".
> >
> > That's also fine.
> >
> > >
> > >
> > > > 2) optimize the unmap for page pool
> > > >
> > > > or even
> > > >
> > > > 3) just do dma_unmap before returning the page back to the page pool,
> > > > we don't get all the benefits of page pool but we end up with simple
> > > > codes (no fallback for premapping).
> > >
> > > I am ok for this.
> >
> > Right, we just need to make sure there's no performance regression,
> > then it would be fine.
> >
> > I see for example mana did this as well.
>
> I think we should not use page pool directly now,
> because the mana does not need a space to store the dma address.
> We need to store the dma address for unmapping.
>
> If we use page pool without PP_FLAG_DMA_MAP, then store the dma address by
> page.dma_addr, I think that is not safe.

Jesper, could you comment on this?

>
> I think the way of this patch set is fine.

So it reuses page pool structure in the page structure for another use case.

> We just use the
> space of the page whatever it is page pool or not to store
> the link and dma address.

Probably because we've already "abused" page->private. I would leave
it for other maintainers to decide.

Thanks

>
> Thanks.
>
> >
> > Thanks
> >
> > >
> > >
> > > Thanks.
> > >
> > > >
> > > > Thanks
> > > >
> > > >
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > Maybe for big mode it doesn't matter too much if there's no
> > > > > > performance improvement.
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > >
> > > > > > > Thanks.
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Thanks.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Thanks
> > > > > > > > > >
> > > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > > >
> > >
> >
> >
>
Xuan Zhuo April 17, 2024, 8:20 a.m. UTC | #13
On Wed, 17 Apr 2024 12:08:10 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Wed, Apr 17, 2024 at 9:38 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > On Tue, 16 Apr 2024 11:24:53 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > On Mon, Apr 15, 2024 at 5:04 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > >
> > > > On Mon, 15 Apr 2024 16:56:45 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > On Mon, Apr 15, 2024 at 4:50 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > >
> > > > > > On Mon, 15 Apr 2024 14:43:24 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > On Mon, Apr 15, 2024 at 10:35 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > >
> > > > > > > > On Fri, 12 Apr 2024 13:49:12 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > > > On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > Now, we chain the pages of big mode by the page's private variable.
> > > > > > > > > > > > But a subsequent patch aims to make the big mode to support
> > > > > > > > > > > > premapped mode. This requires additional space to store the dma addr.
> > > > > > > > > > > >
> > > > > > > > > > > > Within the sub-struct that contains the 'private', there is no suitable
> > > > > > > > > > > > variable for storing the DMA addr.
> > > > > > > > > > > >
> > > > > > > > > > > >                 struct {        /* Page cache and anonymous pages */
> > > > > > > > > > > >                         /**
> > > > > > > > > > > >                          * @lru: Pageout list, eg. active_list protected by
> > > > > > > > > > > >                          * lruvec->lru_lock.  Sometimes used as a generic list
> > > > > > > > > > > >                          * by the page owner.
> > > > > > > > > > > >                          */
> > > > > > > > > > > >                         union {
> > > > > > > > > > > >                                 struct list_head lru;
> > > > > > > > > > > >
> > > > > > > > > > > >                                 /* Or, for the Unevictable "LRU list" slot */
> > > > > > > > > > > >                                 struct {
> > > > > > > > > > > >                                         /* Always even, to negate PageTail */
> > > > > > > > > > > >                                         void *__filler;
> > > > > > > > > > > >                                         /* Count page's or folio's mlocks */
> > > > > > > > > > > >                                         unsigned int mlock_count;
> > > > > > > > > > > >                                 };
> > > > > > > > > > > >
> > > > > > > > > > > >                                 /* Or, free page */
> > > > > > > > > > > >                                 struct list_head buddy_list;
> > > > > > > > > > > >                                 struct list_head pcp_list;
> > > > > > > > > > > >                         };
> > > > > > > > > > > >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> > > > > > > > > > > >                         struct address_space *mapping;
> > > > > > > > > > > >                         union {
> > > > > > > > > > > >                                 pgoff_t index;          /* Our offset within mapping. */
> > > > > > > > > > > >                                 unsigned long share;    /* share count for fsdax */
> > > > > > > > > > > >                         };
> > > > > > > > > > > >                         /**
> > > > > > > > > > > >                          * @private: Mapping-private opaque data.
> > > > > > > > > > > >                          * Usually used for buffer_heads if PagePrivate.
> > > > > > > > > > > >                          * Used for swp_entry_t if PageSwapCache.
> > > > > > > > > > > >                          * Indicates order in the buddy system if PageBuddy.
> > > > > > > > > > > >                          */
> > > > > > > > > > > >                         unsigned long private;
> > > > > > > > > > > >                 };
> > > > > > > > > > > >
> > > > > > > > > > > > But within the page pool struct, we have a variable called
> > > > > > > > > > > > dma_addr that is appropriate for storing dma addr.
> > > > > > > > > > > > And that struct is used by netstack. That works to our advantage.
> > > > > > > > > > > >
> > > > > > > > > > > >                 struct {        /* page_pool used by netstack */
> > > > > > > > > > > >                         /**
> > > > > > > > > > > >                          * @pp_magic: magic value to avoid recycling non
> > > > > > > > > > > >                          * page_pool allocated pages.
> > > > > > > > > > > >                          */
> > > > > > > > > > > >                         unsigned long pp_magic;
> > > > > > > > > > > >                         struct page_pool *pp;
> > > > > > > > > > > >                         unsigned long _pp_mapping_pad;
> > > > > > > > > > > >                         unsigned long dma_addr;
> > > > > > > > > > > >                         atomic_long_t pp_ref_count;
> > > > > > > > > > > >                 };
> > > > > > > > > > > >
> > > > > > > > > > > > On the other side, we should use variables from the same sub-struct.
> > > > > > > > > > > > So this patch replaces the "private" with "pp".
> > > > > > > > > > > >
> > > > > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > > > > > ---
> > > > > > > > > > >
> > > > > > > > > > > Instead of doing a customized version of page pool, can we simply
> > > > > > > > > > > switch to use page pool for big mode instead? Then we don't need to
> > > > > > > > > > > bother the dma stuffs.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > The page pool needs to do the dma by the DMA APIs.
> > > > > > > > > > So we can not use the page pool directly.
> > > > > > > > >
> > > > > > > > > I found this:
> > > > > > > > >
> > > > > > > > > define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
> > > > > > > > >                                         * map/unmap
> > > > > > > > >
> > > > > > > > > It seems to work here?
> > > > > > > >
> > > > > > > >
> > > > > > > > I have studied the page pool mechanism and believe that we cannot use it
> > > > > > > > directly. We can make the page pool to bypass the DMA operations.
> > > > > > > > This allows us to handle DMA within virtio-net for pages allocated from the page
> > > > > > > > pool. Furthermore, we can utilize page pool helpers to associate the DMA address
> > > > > > > > to the page.
> > > > > > > >
> > > > > > > > However, the critical issue pertains to unmapping. Ideally, we want to return
> > > > > > > > the mapped pages to the page pool and reuse them. In doing so, we can omit the
> > > > > > > > unmapping and remapping steps.
> > > > > > > >
> > > > > > > > Currently, there's a caveat: when the page pool cache is full, it disconnects
> > > > > > > > and releases the pages. When the pool hits its capacity, pages are relinquished
> > > > > > > > without a chance for unmapping.
> > > > > > >
> > > > > > > Technically, when ptr_ring is full there could be a fallback, but then
> > > > > > > it requires expensive synchronization between producer and consumer.
> > > > > > > For virtio-net, it might not be a problem because add/get has been
> > > > > > > synchronized. (It might be relaxed in the future, actually we've
> > > > > > > already seen a requirement in the past for virito-blk).
> > > > > >
> > > > > > The point is that the page will be released by page pool directly,
> > > > > > we will have no change to unmap that, if we work with page pool.
> > > > >
> > > > > I mean if we have a fallback, there would be no need to release these
> > > > > pages but put them into a link list.
> > > >
> > > >
> > > > What fallback?
> > >
> > > https://lore.kernel.org/netdev/1519607771-20613-1-git-send-email-mst@redhat.com/
> > >
> > > >
> > > > If we put the pages to the link list, why we use the page pool?
> > >
> > > The size of the cache and ptr_ring needs to be fixed.
> > >
> > > Again, as explained above, it needs more benchmarks and looks like a
> > > separate topic.
> > >
> > > >
> > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > > If we were to unmap pages each time before
> > > > > > > > returning them to the pool, we would negate the benefits of bypassing the
> > > > > > > > mapping and unmapping process altogether.
> > > > > > >
> > > > > > > Yes, but the problem in this approach is that it creates a corner
> > > > > > > exception where dma_addr is used outside the page pool.
> > > > > >
> > > > > > YES. This is a corner exception. We need to introduce this case to the page
> > > > > > pool.
> > > > > >
> > > > > > So for introducing the page-pool to virtio-net(not only for big mode),
> > > > > > we may need to push the page-pool to support dma by drivers.
> > > > >
> > > > > Adding Jesper for some comments.
> > > > >
> > > > > >
> > > > > > Back to this patch set, I think we should keep the virtio-net to manage
> > > > > > the pages.
> > > > > >
> > > > > > What do you think?
> > > > >
> > > > > I might be wrong, but I think if we need to either
> > > > >
> > > > > 1) seek a way to manage the pages by yourself but not touching page
> > > > > pool metadata (or Jesper is fine with this)
> > > >
> > > > Do you mean working with page pool or not?
> > > >
> > >
> > > I meant if Jesper is fine with reusing page pool metadata like this patch.
> > >
> > > > If we manage the pages by self(no page pool), we do not care the metadata is for
> > > > page pool or not. We just use the space of pages like the "private".
> > >
> > > That's also fine.
> > >
> > > >
> > > >
> > > > > 2) optimize the unmap for page pool
> > > > >
> > > > > or even
> > > > >
> > > > > 3) just do dma_unmap before returning the page back to the page pool,
> > > > > we don't get all the benefits of page pool but we end up with simple
> > > > > codes (no fallback for premapping).
> > > >
> > > > I am ok for this.
> > >
> > > Right, we just need to make sure there's no performance regression,
> > > then it would be fine.
> > >
> > > I see for example mana did this as well.
> >
> > I think we should not use page pool directly now,
> > because the mana does not need a space to store the dma address.
> > We need to store the dma address for unmapping.
> >
> > If we use page pool without PP_FLAG_DMA_MAP, then store the dma address by
> > page.dma_addr, I think that is not safe.
>
> Jesper, could you comment on this?
>
> >
> > I think the way of this patch set is fine.
>
> So it reuses page pool structure in the page structure for another use case.
>
> > We just use the
> > space of the page whatever it is page pool or not to store
> > the link and dma address.
>
> Probably because we've already "abused" page->private. I would leave
> it for other maintainers to decide.

If we do not want to use the elements of the page directly,
the page pool is a good way.

But we must make the page pool to work without PP_FLAG_DMA_MAP, because the
virtio-net must use the DMA APIs wrapped by virtio core.

And we still need to store the dma address, because virtio-net
can not access the descs directly.

@Jesper can we setup the page pool without PP_FLAG_DMA_MAP,
and call page_pool_set_dma_addr() from the virtio-net driver?

Thanks.



>
> Thanks
>
> >
> > Thanks.
> >
> > >
> > > Thanks
> > >
> > > >
> > > >
> > > > Thanks.
> > > >
> > > > >
> > > > > Thanks
> > > > >
> > > > >
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > >
> > > > > > > Maybe for big mode it doesn't matter too much if there's no
> > > > > > > performance improvement.
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > >
> > > > > > > > Thanks.
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Thanks
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Thanks.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Thanks
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > > >
> > >
> > >
> >
>
Jason Wang April 18, 2024, 4:15 a.m. UTC | #14
On Wed, Apr 17, 2024 at 4:45 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Wed, 17 Apr 2024 12:08:10 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Wed, Apr 17, 2024 at 9:38 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > On Tue, 16 Apr 2024 11:24:53 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > On Mon, Apr 15, 2024 at 5:04 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > >
> > > > > On Mon, 15 Apr 2024 16:56:45 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > On Mon, Apr 15, 2024 at 4:50 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > >
> > > > > > > On Mon, 15 Apr 2024 14:43:24 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > On Mon, Apr 15, 2024 at 10:35 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > >
> > > > > > > > > On Fri, 12 Apr 2024 13:49:12 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > > On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > > > > On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > Now, we chain the pages of big mode by the page's private variable.
> > > > > > > > > > > > > But a subsequent patch aims to make the big mode to support
> > > > > > > > > > > > > premapped mode. This requires additional space to store the dma addr.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Within the sub-struct that contains the 'private', there is no suitable
> > > > > > > > > > > > > variable for storing the DMA addr.
> > > > > > > > > > > > >
> > > > > > > > > > > > >                 struct {        /* Page cache and anonymous pages */
> > > > > > > > > > > > >                         /**
> > > > > > > > > > > > >                          * @lru: Pageout list, eg. active_list protected by
> > > > > > > > > > > > >                          * lruvec->lru_lock.  Sometimes used as a generic list
> > > > > > > > > > > > >                          * by the page owner.
> > > > > > > > > > > > >                          */
> > > > > > > > > > > > >                         union {
> > > > > > > > > > > > >                                 struct list_head lru;
> > > > > > > > > > > > >
> > > > > > > > > > > > >                                 /* Or, for the Unevictable "LRU list" slot */
> > > > > > > > > > > > >                                 struct {
> > > > > > > > > > > > >                                         /* Always even, to negate PageTail */
> > > > > > > > > > > > >                                         void *__filler;
> > > > > > > > > > > > >                                         /* Count page's or folio's mlocks */
> > > > > > > > > > > > >                                         unsigned int mlock_count;
> > > > > > > > > > > > >                                 };
> > > > > > > > > > > > >
> > > > > > > > > > > > >                                 /* Or, free page */
> > > > > > > > > > > > >                                 struct list_head buddy_list;
> > > > > > > > > > > > >                                 struct list_head pcp_list;
> > > > > > > > > > > > >                         };
> > > > > > > > > > > > >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> > > > > > > > > > > > >                         struct address_space *mapping;
> > > > > > > > > > > > >                         union {
> > > > > > > > > > > > >                                 pgoff_t index;          /* Our offset within mapping. */
> > > > > > > > > > > > >                                 unsigned long share;    /* share count for fsdax */
> > > > > > > > > > > > >                         };
> > > > > > > > > > > > >                         /**
> > > > > > > > > > > > >                          * @private: Mapping-private opaque data.
> > > > > > > > > > > > >                          * Usually used for buffer_heads if PagePrivate.
> > > > > > > > > > > > >                          * Used for swp_entry_t if PageSwapCache.
> > > > > > > > > > > > >                          * Indicates order in the buddy system if PageBuddy.
> > > > > > > > > > > > >                          */
> > > > > > > > > > > > >                         unsigned long private;
> > > > > > > > > > > > >                 };
> > > > > > > > > > > > >
> > > > > > > > > > > > > But within the page pool struct, we have a variable called
> > > > > > > > > > > > > dma_addr that is appropriate for storing dma addr.
> > > > > > > > > > > > > And that struct is used by netstack. That works to our advantage.
> > > > > > > > > > > > >
> > > > > > > > > > > > >                 struct {        /* page_pool used by netstack */
> > > > > > > > > > > > >                         /**
> > > > > > > > > > > > >                          * @pp_magic: magic value to avoid recycling non
> > > > > > > > > > > > >                          * page_pool allocated pages.
> > > > > > > > > > > > >                          */
> > > > > > > > > > > > >                         unsigned long pp_magic;
> > > > > > > > > > > > >                         struct page_pool *pp;
> > > > > > > > > > > > >                         unsigned long _pp_mapping_pad;
> > > > > > > > > > > > >                         unsigned long dma_addr;
> > > > > > > > > > > > >                         atomic_long_t pp_ref_count;
> > > > > > > > > > > > >                 };
> > > > > > > > > > > > >
> > > > > > > > > > > > > On the other side, we should use variables from the same sub-struct.
> > > > > > > > > > > > > So this patch replaces the "private" with "pp".
> > > > > > > > > > > > >
> > > > > > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > > > > > > ---
> > > > > > > > > > > >
> > > > > > > > > > > > Instead of doing a customized version of page pool, can we simply
> > > > > > > > > > > > switch to use page pool for big mode instead? Then we don't need to
> > > > > > > > > > > > bother the dma stuffs.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > The page pool needs to do the dma by the DMA APIs.
> > > > > > > > > > > So we can not use the page pool directly.
> > > > > > > > > >
> > > > > > > > > > I found this:
> > > > > > > > > >
> > > > > > > > > > define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
> > > > > > > > > >                                         * map/unmap
> > > > > > > > > >
> > > > > > > > > > It seems to work here?
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > I have studied the page pool mechanism and believe that we cannot use it
> > > > > > > > > directly. We can make the page pool to bypass the DMA operations.
> > > > > > > > > This allows us to handle DMA within virtio-net for pages allocated from the page
> > > > > > > > > pool. Furthermore, we can utilize page pool helpers to associate the DMA address
> > > > > > > > > to the page.
> > > > > > > > >
> > > > > > > > > However, the critical issue pertains to unmapping. Ideally, we want to return
> > > > > > > > > the mapped pages to the page pool and reuse them. In doing so, we can omit the
> > > > > > > > > unmapping and remapping steps.
> > > > > > > > >
> > > > > > > > > Currently, there's a caveat: when the page pool cache is full, it disconnects
> > > > > > > > > and releases the pages. When the pool hits its capacity, pages are relinquished
> > > > > > > > > without a chance for unmapping.
> > > > > > > >
> > > > > > > > Technically, when ptr_ring is full there could be a fallback, but then
> > > > > > > > it requires expensive synchronization between producer and consumer.
> > > > > > > > For virtio-net, it might not be a problem because add/get has been
> > > > > > > > synchronized. (It might be relaxed in the future, actually we've
> > > > > > > > already seen a requirement in the past for virito-blk).
> > > > > > >
> > > > > > > The point is that the page will be released by page pool directly,
> > > > > > > we will have no change to unmap that, if we work with page pool.
> > > > > >
> > > > > > I mean if we have a fallback, there would be no need to release these
> > > > > > pages but put them into a link list.
> > > > >
> > > > >
> > > > > What fallback?
> > > >
> > > > https://lore.kernel.org/netdev/1519607771-20613-1-git-send-email-mst@redhat.com/
> > > >
> > > > >
> > > > > If we put the pages to the link list, why we use the page pool?
> > > >
> > > > The size of the cache and ptr_ring needs to be fixed.
> > > >
> > > > Again, as explained above, it needs more benchmarks and looks like a
> > > > separate topic.
> > > >
> > > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > > If we were to unmap pages each time before
> > > > > > > > > returning them to the pool, we would negate the benefits of bypassing the
> > > > > > > > > mapping and unmapping process altogether.
> > > > > > > >
> > > > > > > > Yes, but the problem in this approach is that it creates a corner
> > > > > > > > exception where dma_addr is used outside the page pool.
> > > > > > >
> > > > > > > YES. This is a corner exception. We need to introduce this case to the page
> > > > > > > pool.
> > > > > > >
> > > > > > > So for introducing the page-pool to virtio-net(not only for big mode),
> > > > > > > we may need to push the page-pool to support dma by drivers.
> > > > > >
> > > > > > Adding Jesper for some comments.
> > > > > >
> > > > > > >
> > > > > > > Back to this patch set, I think we should keep the virtio-net to manage
> > > > > > > the pages.
> > > > > > >
> > > > > > > What do you think?
> > > > > >
> > > > > > I might be wrong, but I think if we need to either
> > > > > >
> > > > > > 1) seek a way to manage the pages by yourself but not touching page
> > > > > > pool metadata (or Jesper is fine with this)
> > > > >
> > > > > Do you mean working with page pool or not?
> > > > >
> > > >
> > > > I meant if Jesper is fine with reusing page pool metadata like this patch.
> > > >
> > > > > If we manage the pages by self(no page pool), we do not care the metadata is for
> > > > > page pool or not. We just use the space of pages like the "private".
> > > >
> > > > That's also fine.
> > > >
> > > > >
> > > > >
> > > > > > 2) optimize the unmap for page pool
> > > > > >
> > > > > > or even
> > > > > >
> > > > > > 3) just do dma_unmap before returning the page back to the page pool,
> > > > > > we don't get all the benefits of page pool but we end up with simple
> > > > > > codes (no fallback for premapping).
> > > > >
> > > > > I am ok for this.
> > > >
> > > > Right, we just need to make sure there's no performance regression,
> > > > then it would be fine.
> > > >
> > > > I see for example mana did this as well.
> > >
> > > I think we should not use page pool directly now,
> > > because the mana does not need a space to store the dma address.
> > > We need to store the dma address for unmapping.
> > >
> > > If we use page pool without PP_FLAG_DMA_MAP, then store the dma address by
> > > page.dma_addr, I think that is not safe.
> >
> > Jesper, could you comment on this?
> >
> > >
> > > I think the way of this patch set is fine.
> >
> > So it reuses page pool structure in the page structure for another use case.
> >
> > > We just use the
> > > space of the page whatever it is page pool or not to store
> > > the link and dma address.
> >
> > Probably because we've already "abused" page->private. I would leave
> > it for other maintainers to decide.
>
> If we do not want to use the elements of the page directly,
> the page pool is a good way.
>

Rethink this, I think that the approach of this series should be fine.
It should be sufficient for virtio-net to guarantee that those pages
are not used by the page pool.

I will continue the review.

Thanks
Jason Wang April 18, 2024, 4:16 a.m. UTC | #15
On Thu, Apr 18, 2024 at 12:15 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Wed, Apr 17, 2024 at 4:45 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > On Wed, 17 Apr 2024 12:08:10 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > On Wed, Apr 17, 2024 at 9:38 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > >
> > > > On Tue, 16 Apr 2024 11:24:53 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > On Mon, Apr 15, 2024 at 5:04 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > >
> > > > > > On Mon, 15 Apr 2024 16:56:45 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > On Mon, Apr 15, 2024 at 4:50 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, 15 Apr 2024 14:43:24 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > On Mon, Apr 15, 2024 at 10:35 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Fri, 12 Apr 2024 13:49:12 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > > > On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > > > > > On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Now, we chain the pages of big mode by the page's private variable.
> > > > > > > > > > > > > > But a subsequent patch aims to make the big mode to support
> > > > > > > > > > > > > > premapped mode. This requires additional space to store the dma addr.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Within the sub-struct that contains the 'private', there is no suitable
> > > > > > > > > > > > > > variable for storing the DMA addr.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >                 struct {        /* Page cache and anonymous pages */
> > > > > > > > > > > > > >                         /**
> > > > > > > > > > > > > >                          * @lru: Pageout list, eg. active_list protected by
> > > > > > > > > > > > > >                          * lruvec->lru_lock.  Sometimes used as a generic list
> > > > > > > > > > > > > >                          * by the page owner.
> > > > > > > > > > > > > >                          */
> > > > > > > > > > > > > >                         union {
> > > > > > > > > > > > > >                                 struct list_head lru;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >                                 /* Or, for the Unevictable "LRU list" slot */
> > > > > > > > > > > > > >                                 struct {
> > > > > > > > > > > > > >                                         /* Always even, to negate PageTail */
> > > > > > > > > > > > > >                                         void *__filler;
> > > > > > > > > > > > > >                                         /* Count page's or folio's mlocks */
> > > > > > > > > > > > > >                                         unsigned int mlock_count;
> > > > > > > > > > > > > >                                 };
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >                                 /* Or, free page */
> > > > > > > > > > > > > >                                 struct list_head buddy_list;
> > > > > > > > > > > > > >                                 struct list_head pcp_list;
> > > > > > > > > > > > > >                         };
> > > > > > > > > > > > > >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> > > > > > > > > > > > > >                         struct address_space *mapping;
> > > > > > > > > > > > > >                         union {
> > > > > > > > > > > > > >                                 pgoff_t index;          /* Our offset within mapping. */
> > > > > > > > > > > > > >                                 unsigned long share;    /* share count for fsdax */
> > > > > > > > > > > > > >                         };
> > > > > > > > > > > > > >                         /**
> > > > > > > > > > > > > >                          * @private: Mapping-private opaque data.
> > > > > > > > > > > > > >                          * Usually used for buffer_heads if PagePrivate.
> > > > > > > > > > > > > >                          * Used for swp_entry_t if PageSwapCache.
> > > > > > > > > > > > > >                          * Indicates order in the buddy system if PageBuddy.
> > > > > > > > > > > > > >                          */
> > > > > > > > > > > > > >                         unsigned long private;
> > > > > > > > > > > > > >                 };
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > But within the page pool struct, we have a variable called
> > > > > > > > > > > > > > dma_addr that is appropriate for storing dma addr.
> > > > > > > > > > > > > > And that struct is used by netstack. That works to our advantage.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >                 struct {        /* page_pool used by netstack */
> > > > > > > > > > > > > >                         /**
> > > > > > > > > > > > > >                          * @pp_magic: magic value to avoid recycling non
> > > > > > > > > > > > > >                          * page_pool allocated pages.
> > > > > > > > > > > > > >                          */
> > > > > > > > > > > > > >                         unsigned long pp_magic;
> > > > > > > > > > > > > >                         struct page_pool *pp;
> > > > > > > > > > > > > >                         unsigned long _pp_mapping_pad;
> > > > > > > > > > > > > >                         unsigned long dma_addr;
> > > > > > > > > > > > > >                         atomic_long_t pp_ref_count;
> > > > > > > > > > > > > >                 };
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > On the other side, we should use variables from the same sub-struct.
> > > > > > > > > > > > > > So this patch replaces the "private" with "pp".
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > > > > > > > ---
> > > > > > > > > > > > >
> > > > > > > > > > > > > Instead of doing a customized version of page pool, can we simply
> > > > > > > > > > > > > switch to use page pool for big mode instead? Then we don't need to
> > > > > > > > > > > > > bother the dma stuffs.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > The page pool needs to do the dma by the DMA APIs.
> > > > > > > > > > > > So we can not use the page pool directly.
> > > > > > > > > > >
> > > > > > > > > > > I found this:
> > > > > > > > > > >
> > > > > > > > > > > define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
> > > > > > > > > > >                                         * map/unmap
> > > > > > > > > > >
> > > > > > > > > > > It seems to work here?
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > I have studied the page pool mechanism and believe that we cannot use it
> > > > > > > > > > directly. We can make the page pool to bypass the DMA operations.
> > > > > > > > > > This allows us to handle DMA within virtio-net for pages allocated from the page
> > > > > > > > > > pool. Furthermore, we can utilize page pool helpers to associate the DMA address
> > > > > > > > > > to the page.
> > > > > > > > > >
> > > > > > > > > > However, the critical issue pertains to unmapping. Ideally, we want to return
> > > > > > > > > > the mapped pages to the page pool and reuse them. In doing so, we can omit the
> > > > > > > > > > unmapping and remapping steps.
> > > > > > > > > >
> > > > > > > > > > Currently, there's a caveat: when the page pool cache is full, it disconnects
> > > > > > > > > > and releases the pages. When the pool hits its capacity, pages are relinquished
> > > > > > > > > > without a chance for unmapping.
> > > > > > > > >
> > > > > > > > > Technically, when ptr_ring is full there could be a fallback, but then
> > > > > > > > > it requires expensive synchronization between producer and consumer.
> > > > > > > > > For virtio-net, it might not be a problem because add/get has been
> > > > > > > > > synchronized. (It might be relaxed in the future, actually we've
> > > > > > > > > already seen a requirement in the past for virito-blk).
> > > > > > > >
> > > > > > > > The point is that the page will be released by page pool directly,
> > > > > > > > we will have no change to unmap that, if we work with page pool.
> > > > > > >
> > > > > > > I mean if we have a fallback, there would be no need to release these
> > > > > > > pages but put them into a link list.
> > > > > >
> > > > > >
> > > > > > What fallback?
> > > > >
> > > > > https://lore.kernel.org/netdev/1519607771-20613-1-git-send-email-mst@redhat.com/
> > > > >
> > > > > >
> > > > > > If we put the pages to the link list, why we use the page pool?
> > > > >
> > > > > The size of the cache and ptr_ring needs to be fixed.
> > > > >
> > > > > Again, as explained above, it needs more benchmarks and looks like a
> > > > > separate topic.
> > > > >
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > If we were to unmap pages each time before
> > > > > > > > > > returning them to the pool, we would negate the benefits of bypassing the
> > > > > > > > > > mapping and unmapping process altogether.
> > > > > > > > >
> > > > > > > > > Yes, but the problem in this approach is that it creates a corner
> > > > > > > > > exception where dma_addr is used outside the page pool.
> > > > > > > >
> > > > > > > > YES. This is a corner exception. We need to introduce this case to the page
> > > > > > > > pool.
> > > > > > > >
> > > > > > > > So for introducing the page-pool to virtio-net(not only for big mode),
> > > > > > > > we may need to push the page-pool to support dma by drivers.
> > > > > > >
> > > > > > > Adding Jesper for some comments.
> > > > > > >
> > > > > > > >
> > > > > > > > Back to this patch set, I think we should keep the virtio-net to manage
> > > > > > > > the pages.
> > > > > > > >
> > > > > > > > What do you think?
> > > > > > >
> > > > > > > I might be wrong, but I think if we need to either
> > > > > > >
> > > > > > > 1) seek a way to manage the pages by yourself but not touching page
> > > > > > > pool metadata (or Jesper is fine with this)
> > > > > >
> > > > > > Do you mean working with page pool or not?
> > > > > >
> > > > >
> > > > > I meant if Jesper is fine with reusing page pool metadata like this patch.
> > > > >
> > > > > > If we manage the pages by self(no page pool), we do not care the metadata is for
> > > > > > page pool or not. We just use the space of pages like the "private".
> > > > >
> > > > > That's also fine.
> > > > >
> > > > > >
> > > > > >
> > > > > > > 2) optimize the unmap for page pool
> > > > > > >
> > > > > > > or even
> > > > > > >
> > > > > > > 3) just do dma_unmap before returning the page back to the page pool,
> > > > > > > we don't get all the benefits of page pool but we end up with simple
> > > > > > > codes (no fallback for premapping).
> > > > > >
> > > > > > I am ok for this.
> > > > >
> > > > > Right, we just need to make sure there's no performance regression,
> > > > > then it would be fine.
> > > > >
> > > > > I see for example mana did this as well.
> > > >
> > > > I think we should not use page pool directly now,
> > > > because the mana does not need a space to store the dma address.
> > > > We need to store the dma address for unmapping.
> > > >
> > > > If we use page pool without PP_FLAG_DMA_MAP, then store the dma address by
> > > > page.dma_addr, I think that is not safe.
> > >
> > > Jesper, could you comment on this?
> > >
> > > >
> > > > I think the way of this patch set is fine.
> > >
> > > So it reuses page pool structure in the page structure for another use case.
> > >
> > > > We just use the
> > > > space of the page whatever it is page pool or not to store
> > > > the link and dma address.
> > >
> > > Probably because we've already "abused" page->private. I would leave
> > > it for other maintainers to decide.
> >
> > If we do not want to use the elements of the page directly,
> > the page pool is a good way.
> >
>
> Rethink this, I think that the approach of this series should be fine.
> It should be sufficient for virtio-net to guarantee that those pages
> are not used by the page pool.
>
> I will continue the review.

Btw, it would be better to describe those design considerations in the
changelog (e.g why we don't use page pool etc).

Thanks

>
> Thanks
Jason Wang April 18, 2024, 6:11 a.m. UTC | #16
On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> Now, we chain the pages of big mode by the page's private variable.
> But a subsequent patch aims to make the big mode to support
> premapped mode. This requires additional space to store the dma addr.
>
> Within the sub-struct that contains the 'private', there is no suitable
> variable for storing the DMA addr.
>
>                 struct {        /* Page cache and anonymous pages */
>                         /**
>                          * @lru: Pageout list, eg. active_list protected by
>                          * lruvec->lru_lock.  Sometimes used as a generic list
>                          * by the page owner.
>                          */
>                         union {
>                                 struct list_head lru;
>
>                                 /* Or, for the Unevictable "LRU list" slot */
>                                 struct {
>                                         /* Always even, to negate PageTail */
>                                         void *__filler;
>                                         /* Count page's or folio's mlocks */
>                                         unsigned int mlock_count;
>                                 };
>
>                                 /* Or, free page */
>                                 struct list_head buddy_list;
>                                 struct list_head pcp_list;
>                         };
>                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
>                         struct address_space *mapping;
>                         union {
>                                 pgoff_t index;          /* Our offset within mapping. */
>                                 unsigned long share;    /* share count for fsdax */
>                         };
>                         /**
>                          * @private: Mapping-private opaque data.
>                          * Usually used for buffer_heads if PagePrivate.
>                          * Used for swp_entry_t if PageSwapCache.
>                          * Indicates order in the buddy system if PageBuddy.
>                          */
>                         unsigned long private;
>                 };
>
> But within the page pool struct, we have a variable called
> dma_addr that is appropriate for storing dma addr.
> And that struct is used by netstack. That works to our advantage.
>
>                 struct {        /* page_pool used by netstack */
>                         /**
>                          * @pp_magic: magic value to avoid recycling non
>                          * page_pool allocated pages.
>                          */
>                         unsigned long pp_magic;
>                         struct page_pool *pp;
>                         unsigned long _pp_mapping_pad;
>                         unsigned long dma_addr;
>                         atomic_long_t pp_ref_count;
>                 };
>
> On the other side, we should use variables from the same sub-struct.
> So this patch replaces the "private" with "pp".
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
>  drivers/net/virtio_net.c | 27 +++++++++++++++------------
>  1 file changed, 15 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index c22d1118a133..4446fb54de6d 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -48,6 +48,9 @@ module_param(napi_tx, bool, 0644);
>
>  #define VIRTIO_XDP_FLAG        BIT(0)
>
> +#define page_chain_next(p)     ((struct page *)((p)->pp))
> +#define page_chain_add(p, n)   ((p)->pp = (void *)n)
> +
>  /* RX packet size EWMA. The average packet size is used to determine the packet
>   * buffer size when refilling RX rings. As the entire RX ring may be refilled
>   * at once, the weight is chosen so that the EWMA will be insensitive to short-
> @@ -191,7 +194,7 @@ struct receive_queue {
>
>         struct virtnet_interrupt_coalesce intr_coal;
>
> -       /* Chain pages by the private ptr. */
> +       /* Chain pages by the page's pp struct. */
>         struct page *pages;
>
>         /* Average packet length for mergeable receive buffers. */
> @@ -432,16 +435,16 @@ skb_vnet_common_hdr(struct sk_buff *skb)
>  }
>
>  /*
> - * private is used to chain pages for big packets, put the whole
> - * most recent used list in the beginning for reuse
> + * put the whole most recent used list in the beginning for reuse
>   */

While at this, let's explain the pp is used to chain pages or we can
do it on the definition of page_chain_add().

Others look good.

Acked-by: Jason Wang <jasowang@redhat.com>

Thanks
Jesper Dangaard Brouer April 18, 2024, 8:19 p.m. UTC | #17
On 17/04/2024 10.20, Xuan Zhuo wrote:
> On Wed, 17 Apr 2024 12:08:10 +0800, Jason Wang <jasowang@redhat.com> wrote:
>> On Wed, Apr 17, 2024 at 9:38 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>>>
>>> On Tue, 16 Apr 2024 11:24:53 +0800, Jason Wang <jasowang@redhat.com> wrote:
>>>> On Mon, Apr 15, 2024 at 5:04 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>>>>>
>>>>> On Mon, 15 Apr 2024 16:56:45 +0800, Jason Wang <jasowang@redhat.com> wrote:
>>>>>> On Mon, Apr 15, 2024 at 4:50 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>>>>>>>
>>>>>>> On Mon, 15 Apr 2024 14:43:24 +0800, Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>> On Mon, Apr 15, 2024 at 10:35 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>>>>>>>>>
>>>>>>>>> On Fri, 12 Apr 2024 13:49:12 +0800, Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>>>> On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>>>>>>>>>>>
>>>>>>>>>>> On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>>>>>> On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>>>>>>>>>>>>>
>>>>>>>>>>>>> Now, we chain the pages of big mode by the page's private variable.
>>>>>>>>>>>>> But a subsequent patch aims to make the big mode to support
>>>>>>>>>>>>> premapped mode. This requires additional space to store the dma addr.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Within the sub-struct that contains the 'private', there is no suitable
>>>>>>>>>>>>> variable for storing the DMA addr.
>>>>>>>>>>>>>
>>>>>>>>>>>>>                  struct {        /* Page cache and anonymous pages */
>>>>>>>>>>>>>                          /**
>>>>>>>>>>>>>                           * @lru: Pageout list, eg. active_list protected by
>>>>>>>>>>>>>                           * lruvec->lru_lock.  Sometimes used as a generic list
>>>>>>>>>>>>>                           * by the page owner.
>>>>>>>>>>>>>                           */
>>>>>>>>>>>>>                          union {
>>>>>>>>>>>>>                                  struct list_head lru;
>>>>>>>>>>>>>
>>>>>>>>>>>>>                                  /* Or, for the Unevictable "LRU list" slot */
>>>>>>>>>>>>>                                  struct {
>>>>>>>>>>>>>                                          /* Always even, to negate PageTail */
>>>>>>>>>>>>>                                          void *__filler;
>>>>>>>>>>>>>                                          /* Count page's or folio's mlocks */
>>>>>>>>>>>>>                                          unsigned int mlock_count;
>>>>>>>>>>>>>                                  };
>>>>>>>>>>>>>
>>>>>>>>>>>>>                                  /* Or, free page */
>>>>>>>>>>>>>                                  struct list_head buddy_list;
>>>>>>>>>>>>>                                  struct list_head pcp_list;
>>>>>>>>>>>>>                          };
>>>>>>>>>>>>>                          /* See page-flags.h for PAGE_MAPPING_FLAGS */
>>>>>>>>>>>>>                          struct address_space *mapping;
>>>>>>>>>>>>>                          union {
>>>>>>>>>>>>>                                  pgoff_t index;          /* Our offset within mapping. */
>>>>>>>>>>>>>                                  unsigned long share;    /* share count for fsdax */
>>>>>>>>>>>>>                          };
>>>>>>>>>>>>>                          /**
>>>>>>>>>>>>>                           * @private: Mapping-private opaque data.
>>>>>>>>>>>>>                           * Usually used for buffer_heads if PagePrivate.
>>>>>>>>>>>>>                           * Used for swp_entry_t if PageSwapCache.
>>>>>>>>>>>>>                           * Indicates order in the buddy system if PageBuddy.
>>>>>>>>>>>>>                           */
>>>>>>>>>>>>>                          unsigned long private;
>>>>>>>>>>>>>                  };
>>>>>>>>>>>>>
>>>>>>>>>>>>> But within the page pool struct, we have a variable called
>>>>>>>>>>>>> dma_addr that is appropriate for storing dma addr.
>>>>>>>>>>>>> And that struct is used by netstack. That works to our advantage.
>>>>>>>>>>>>>
>>>>>>>>>>>>>                  struct {        /* page_pool used by netstack */
>>>>>>>>>>>>>                          /**
>>>>>>>>>>>>>                           * @pp_magic: magic value to avoid recycling non
>>>>>>>>>>>>>                           * page_pool allocated pages.
>>>>>>>>>>>>>                           */
>>>>>>>>>>>>>                          unsigned long pp_magic;
>>>>>>>>>>>>>                          struct page_pool *pp;
>>>>>>>>>>>>>                          unsigned long _pp_mapping_pad;
>>>>>>>>>>>>>                          unsigned long dma_addr;
>>>>>>>>>>>>>                          atomic_long_t pp_ref_count;
>>>>>>>>>>>>>                  };
>>>>>>>>>>>>>
>>>>>>>>>>>>> On the other side, we should use variables from the same sub-struct.
>>>>>>>>>>>>> So this patch replaces the "private" with "pp".
>>>>>>>>>>>>>
>>>>>>>>>>>>> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>
>>>>>>>>>>>> Instead of doing a customized version of page pool, can we simply
>>>>>>>>>>>> switch to use page pool for big mode instead? Then we don't need to
>>>>>>>>>>>> bother the dma stuffs.
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> The page pool needs to do the dma by the DMA APIs.
>>>>>>>>>>> So we can not use the page pool directly.
>>>>>>>>>>
>>>>>>>>>> I found this:
>>>>>>>>>>
>>>>>>>>>> define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
>>>>>>>>>>                                          * map/unmap
>>>>>>>>>>
>>>>>>>>>> It seems to work here?
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> I have studied the page pool mechanism and believe that we cannot use it
>>>>>>>>> directly. We can make the page pool to bypass the DMA operations.
>>>>>>>>> This allows us to handle DMA within virtio-net for pages allocated from the page
>>>>>>>>> pool. Furthermore, we can utilize page pool helpers to associate the DMA address
>>>>>>>>> to the page.
>>>>>>>>>
>>>>>>>>> However, the critical issue pertains to unmapping. Ideally, we want to return
>>>>>>>>> the mapped pages to the page pool and reuse them. In doing so, we can omit the
>>>>>>>>> unmapping and remapping steps.
>>>>>>>>>
>>>>>>>>> Currently, there's a caveat: when the page pool cache is full, it disconnects
>>>>>>>>> and releases the pages. When the pool hits its capacity, pages are relinquished
>>>>>>>>> without a chance for unmapping.

Could Jakub's memory provider for PP help your use-case?

See: [1] 
https://lore.kernel.org/all/20240403002053.2376017-3-almasrymina@google.com/
Or: [2]
https://lore.kernel.org/netdev/f8270765-a27b-6ccf-33ea-cda097168d79@redhat.com/T/


[...]
>>>>>>
>>>>>> Adding Jesper for some comments.
>>>>>>
>>>>>>>
>>>>>>> Back to this patch set, I think we should keep the virtio-net to manage
>>>>>>> the pages.
>>>>>>>

For context the patch:
  [3] 
https://lore.kernel.org/all/20240411025127.51945-4-xuanzhuo@linux.alibaba.com/

>>>>>>> What do you think?
>>>>>>
>>>>>> I might be wrong, but I think if we need to either
>>>>>>
>>>>>> 1) seek a way to manage the pages by yourself but not touching page
>>>>>> pool metadata (or Jesper is fine with this)
>>>>>
>>>>> Do you mean working with page pool or not?
>>>>>
>>>>
>>>> I meant if Jesper is fine with reusing page pool metadata like this patch.
>>>>
>>>>> If we manage the pages by self(no page pool), we do not care the metadata is for
>>>>> page pool or not. We just use the space of pages like the "private".
>>>>
>>>> That's also fine.
>>>>

I'm not sure it is "fine" to, explicitly choosing not to use page pool,
and then (ab)use `struct page` member (pp) that intended for page_pool
for other stuff. (In this case create a linked list of pages).

  +#define page_chain_next(p)	((struct page *)((p)->pp))
  +#define page_chain_add(p, n)	((p)->pp = (void *)n)

I'm not sure that I (as PP maintainer) can make this call actually, as I
think this area belong with the MM "page" maintainers (Cc MM-list +
people) to judge.

Just invention new ways to use struct page fields without adding your
use-case to struct page, will make it harder for MM people to maintain
(e.g. make future change).

--Jesper
Matthew Wilcox April 18, 2024, 9:56 p.m. UTC | #18
On Thu, Apr 18, 2024 at 10:19:33PM +0200, Jesper Dangaard Brouer wrote:
> I'm not sure it is "fine" to, explicitly choosing not to use page pool,
> and then (ab)use `struct page` member (pp) that intended for page_pool
> for other stuff. (In this case create a linked list of pages).
> 
>  +#define page_chain_next(p)	((struct page *)((p)->pp))
>  +#define page_chain_add(p, n)	((p)->pp = (void *)n)
> 
> I'm not sure that I (as PP maintainer) can make this call actually, as I
> think this area belong with the MM "page" maintainers (Cc MM-list +
> people) to judge.
> 
> Just invention new ways to use struct page fields without adding your
> use-case to struct page, will make it harder for MM people to maintain
> (e.g. make future change).

I can't really follow what's being proposed; the quoting is quite deep.

Here's the current plan for struct page:

 - The individual users are being split off.  This has already happened
   for struct folio, struct slab and struct pgdesc.  Others are hopefully
   coming.
 - At some point, struct page will become:

   struct page {
     unsigned long flags;
     unsigned long data[5];
     unsigned int data2[2];
     ... some other bits and pieces ...
   };

 - After that, we will turn struct page into:

  struct page {
    unsigned long memdesc;
  };

Users like pagepool will allocate a struct ppdesc that will be
referred to by the memdesc.  The bottom 4 bits will identify it as a
ppdesc.  You can put anything you like in a struct ppdesc, it just has
to be allocated from a slab with a 16 byte alignment.

More details here:
https://kernelnewbies.org/MatthewWilcox/Memdescs

This is all likely to land in 2025.  The goal for 2024 is to remove
mapping & index from 'struct page'.  This has been in progress since
2019 so I'm really excited that we're so close!  If you want to
turn struct ppdesc into its own struct like folio, slab & ptdesc,
I'm happy to help.  I once had a patchset for that:

https://lore.kernel.org/netdev/20221130220803.3657490-1-willy@infradead.org/

but I'm sure it's truly bitrotted.
Xuan Zhuo April 19, 2024, 7:11 a.m. UTC | #19
On Thu, 18 Apr 2024 22:19:33 +0200, Jesper Dangaard Brouer <hawk@kernel.org> wrote:
>
>
> On 17/04/2024 10.20, Xuan Zhuo wrote:
> > On Wed, 17 Apr 2024 12:08:10 +0800, Jason Wang <jasowang@redhat.com> wrote:
> >> On Wed, Apr 17, 2024 at 9:38 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >>>
> >>> On Tue, 16 Apr 2024 11:24:53 +0800, Jason Wang <jasowang@redhat.com> wrote:
> >>>> On Mon, Apr 15, 2024 at 5:04 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >>>>>
> >>>>> On Mon, 15 Apr 2024 16:56:45 +0800, Jason Wang <jasowang@redhat.com> wrote:
> >>>>>> On Mon, Apr 15, 2024 at 4:50 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >>>>>>>
> >>>>>>> On Mon, 15 Apr 2024 14:43:24 +0800, Jason Wang <jasowang@redhat.com> wrote:
> >>>>>>>> On Mon, Apr 15, 2024 at 10:35 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >>>>>>>>>
> >>>>>>>>> On Fri, 12 Apr 2024 13:49:12 +0800, Jason Wang <jasowang@redhat.com> wrote:
> >>>>>>>>>> On Fri, Apr 12, 2024 at 1:39 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >>>>>>>>>>>
> >>>>>>>>>>> On Fri, 12 Apr 2024 12:47:55 +0800, Jason Wang <jasowang@redhat.com> wrote:
> >>>>>>>>>>>> On Thu, Apr 11, 2024 at 10:51 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Now, we chain the pages of big mode by the page's private variable.
> >>>>>>>>>>>>> But a subsequent patch aims to make the big mode to support
> >>>>>>>>>>>>> premapped mode. This requires additional space to store the dma addr.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Within the sub-struct that contains the 'private', there is no suitable
> >>>>>>>>>>>>> variable for storing the DMA addr.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>                  struct {        /* Page cache and anonymous pages */
> >>>>>>>>>>>>>                          /**
> >>>>>>>>>>>>>                           * @lru: Pageout list, eg. active_list protected by
> >>>>>>>>>>>>>                           * lruvec->lru_lock.  Sometimes used as a generic list
> >>>>>>>>>>>>>                           * by the page owner.
> >>>>>>>>>>>>>                           */
> >>>>>>>>>>>>>                          union {
> >>>>>>>>>>>>>                                  struct list_head lru;
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>                                  /* Or, for the Unevictable "LRU list" slot */
> >>>>>>>>>>>>>                                  struct {
> >>>>>>>>>>>>>                                          /* Always even, to negate PageTail */
> >>>>>>>>>>>>>                                          void *__filler;
> >>>>>>>>>>>>>                                          /* Count page's or folio's mlocks */
> >>>>>>>>>>>>>                                          unsigned int mlock_count;
> >>>>>>>>>>>>>                                  };
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>                                  /* Or, free page */
> >>>>>>>>>>>>>                                  struct list_head buddy_list;
> >>>>>>>>>>>>>                                  struct list_head pcp_list;
> >>>>>>>>>>>>>                          };
> >>>>>>>>>>>>>                          /* See page-flags.h for PAGE_MAPPING_FLAGS */
> >>>>>>>>>>>>>                          struct address_space *mapping;
> >>>>>>>>>>>>>                          union {
> >>>>>>>>>>>>>                                  pgoff_t index;          /* Our offset within mapping. */
> >>>>>>>>>>>>>                                  unsigned long share;    /* share count for fsdax */
> >>>>>>>>>>>>>                          };
> >>>>>>>>>>>>>                          /**
> >>>>>>>>>>>>>                           * @private: Mapping-private opaque data.
> >>>>>>>>>>>>>                           * Usually used for buffer_heads if PagePrivate.
> >>>>>>>>>>>>>                           * Used for swp_entry_t if PageSwapCache.
> >>>>>>>>>>>>>                           * Indicates order in the buddy system if PageBuddy.
> >>>>>>>>>>>>>                           */
> >>>>>>>>>>>>>                          unsigned long private;
> >>>>>>>>>>>>>                  };
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> But within the page pool struct, we have a variable called
> >>>>>>>>>>>>> dma_addr that is appropriate for storing dma addr.
> >>>>>>>>>>>>> And that struct is used by netstack. That works to our advantage.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>                  struct {        /* page_pool used by netstack */
> >>>>>>>>>>>>>                          /**
> >>>>>>>>>>>>>                           * @pp_magic: magic value to avoid recycling non
> >>>>>>>>>>>>>                           * page_pool allocated pages.
> >>>>>>>>>>>>>                           */
> >>>>>>>>>>>>>                          unsigned long pp_magic;
> >>>>>>>>>>>>>                          struct page_pool *pp;
> >>>>>>>>>>>>>                          unsigned long _pp_mapping_pad;
> >>>>>>>>>>>>>                          unsigned long dma_addr;
> >>>>>>>>>>>>>                          atomic_long_t pp_ref_count;
> >>>>>>>>>>>>>                  };
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> On the other side, we should use variables from the same sub-struct.
> >>>>>>>>>>>>> So this patch replaces the "private" with "pp".
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> >>>>>>>>>>>>> ---
> >>>>>>>>>>>>
> >>>>>>>>>>>> Instead of doing a customized version of page pool, can we simply
> >>>>>>>>>>>> switch to use page pool for big mode instead? Then we don't need to
> >>>>>>>>>>>> bother the dma stuffs.
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> The page pool needs to do the dma by the DMA APIs.
> >>>>>>>>>>> So we can not use the page pool directly.
> >>>>>>>>>>
> >>>>>>>>>> I found this:
> >>>>>>>>>>
> >>>>>>>>>> define PP_FLAG_DMA_MAP         BIT(0) /* Should page_pool do the DMA
> >>>>>>>>>>                                          * map/unmap
> >>>>>>>>>>
> >>>>>>>>>> It seems to work here?
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> I have studied the page pool mechanism and believe that we cannot use it
> >>>>>>>>> directly. We can make the page pool to bypass the DMA operations.
> >>>>>>>>> This allows us to handle DMA within virtio-net for pages allocated from the page
> >>>>>>>>> pool. Furthermore, we can utilize page pool helpers to associate the DMA address
> >>>>>>>>> to the page.
> >>>>>>>>>
> >>>>>>>>> However, the critical issue pertains to unmapping. Ideally, we want to return
> >>>>>>>>> the mapped pages to the page pool and reuse them. In doing so, we can omit the
> >>>>>>>>> unmapping and remapping steps.
> >>>>>>>>>
> >>>>>>>>> Currently, there's a caveat: when the page pool cache is full, it disconnects
> >>>>>>>>> and releases the pages. When the pool hits its capacity, pages are relinquished
> >>>>>>>>> without a chance for unmapping.
>
> Could Jakub's memory provider for PP help your use-case?
>
> See: [1]
> https://lore.kernel.org/all/20240403002053.2376017-3-almasrymina@google.com/
> Or: [2]
> https://lore.kernel.org/netdev/f8270765-a27b-6ccf-33ea-cda097168d79@redhat.com/T/


It can not. That make the pp can get page by the callbacks.

Here we talk about the map/unmap.

The virtio-net has the different DMA APIs.

	dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr, size_t size,
						  enum dma_data_direction dir, unsigned long attrs);
	void virtqueue_dma_unmap_single_attrs(struct virtqueue *_vq, dma_addr_t addr,
					      size_t size, enum dma_data_direction dir,
					      unsigned long attrs);
	dma_addr_t virtqueue_dma_map_page_attrs(struct virtqueue *_vq, struct page *page,
						size_t offset, size_t size,
						enum dma_data_direction dir,
						unsigned long attrs);
	void virtqueue_dma_unmap_page_attrs(struct virtqueue *_vq, dma_addr_t addr,
					    size_t size, enum dma_data_direction dir,
					    unsigned long attrs);
	int virtqueue_dma_mapping_error(struct virtqueue *_vq, dma_addr_t addr);

	bool virtqueue_dma_need_sync(struct virtqueue *_vq, dma_addr_t addr);
	void virtqueue_dma_sync_single_range_for_cpu(struct virtqueue *_vq, dma_addr_t addr,
						     unsigned long offset, size_t size,
						     enum dma_data_direction dir);
	void virtqueue_dma_sync_single_range_for_device(struct virtqueue *_vq, dma_addr_t addr,
							unsigned long offset, size_t size,
							enum dma_data_direction dir);


Thanks.

>
>
> [...]
> >>>>>>
> >>>>>> Adding Jesper for some comments.
> >>>>>>
> >>>>>>>
> >>>>>>> Back to this patch set, I think we should keep the virtio-net to manage
> >>>>>>> the pages.
> >>>>>>>
>
> For context the patch:
>   [3]
> https://lore.kernel.org/all/20240411025127.51945-4-xuanzhuo@linux.alibaba.com/
>
> >>>>>>> What do you think?
> >>>>>>
> >>>>>> I might be wrong, but I think if we need to either
> >>>>>>
> >>>>>> 1) seek a way to manage the pages by yourself but not touching page
> >>>>>> pool metadata (or Jesper is fine with this)
> >>>>>
> >>>>> Do you mean working with page pool or not?
> >>>>>
> >>>>
> >>>> I meant if Jesper is fine with reusing page pool metadata like this patch.
> >>>>
> >>>>> If we manage the pages by self(no page pool), we do not care the metadata is for
> >>>>> page pool or not. We just use the space of pages like the "private".
> >>>>
> >>>> That's also fine.
> >>>>
>
> I'm not sure it is "fine" to, explicitly choosing not to use page pool,
> and then (ab)use `struct page` member (pp) that intended for page_pool
> for other stuff. (In this case create a linked list of pages).
>
>   +#define page_chain_next(p)	((struct page *)((p)->pp))
>   +#define page_chain_add(p, n)	((p)->pp = (void *)n)
>
> I'm not sure that I (as PP maintainer) can make this call actually, as I
> think this area belong with the MM "page" maintainers (Cc MM-list +
> people) to judge.
>
> Just invention new ways to use struct page fields without adding your
> use-case to struct page, will make it harder for MM people to maintain
> (e.g. make future change).
>
> --Jesper
>
>
diff mbox series

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c22d1118a133..4446fb54de6d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -48,6 +48,9 @@  module_param(napi_tx, bool, 0644);
 
 #define VIRTIO_XDP_FLAG	BIT(0)
 
+#define page_chain_next(p)	((struct page *)((p)->pp))
+#define page_chain_add(p, n)	((p)->pp = (void *)n)
+
 /* RX packet size EWMA. The average packet size is used to determine the packet
  * buffer size when refilling RX rings. As the entire RX ring may be refilled
  * at once, the weight is chosen so that the EWMA will be insensitive to short-
@@ -191,7 +194,7 @@  struct receive_queue {
 
 	struct virtnet_interrupt_coalesce intr_coal;
 
-	/* Chain pages by the private ptr. */
+	/* Chain pages by the page's pp struct. */
 	struct page *pages;
 
 	/* Average packet length for mergeable receive buffers. */
@@ -432,16 +435,16 @@  skb_vnet_common_hdr(struct sk_buff *skb)
 }
 
 /*
- * private is used to chain pages for big packets, put the whole
- * most recent used list in the beginning for reuse
+ * put the whole most recent used list in the beginning for reuse
  */
 static void give_pages(struct receive_queue *rq, struct page *page)
 {
 	struct page *end;
 
 	/* Find end of list, sew whole thing into vi->rq.pages. */
-	for (end = page; end->private; end = (struct page *)end->private);
-	end->private = (unsigned long)rq->pages;
+	for (end = page; page_chain_next(end); end = page_chain_next(end));
+
+	page_chain_add(end, rq->pages);
 	rq->pages = page;
 }
 
@@ -450,9 +453,9 @@  static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 	struct page *p = rq->pages;
 
 	if (p) {
-		rq->pages = (struct page *)p->private;
-		/* clear private here, it is used to chain pages */
-		p->private = 0;
+		rq->pages = page_chain_next(p);
+		/* clear chain here, it is used to chain pages */
+		page_chain_add(p, NULL);
 	} else
 		p = alloc_page(gfp_mask);
 	return p;
@@ -609,7 +612,7 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 		if (unlikely(!skb))
 			return NULL;
 
-		page = (struct page *)page->private;
+		page = page_chain_next(page);
 		if (page)
 			give_pages(rq, page);
 		goto ok;
@@ -657,7 +660,7 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
 				frag_size, truesize);
 		len -= frag_size;
-		page = (struct page *)page->private;
+		page = page_chain_next(page);
 		offset = 0;
 	}
 
@@ -1909,7 +1912,7 @@  static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
 		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
 
 		/* chain new page in list head to match sg */
-		first->private = (unsigned long)list;
+		page_chain_add(first, list);
 		list = first;
 	}
 
@@ -1929,7 +1932,7 @@  static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
 	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
 
 	/* chain first in list head */
-	first->private = (unsigned long)list;
+	page_chain_add(first, list);
 	err = virtqueue_add_inbuf(rq->vq, rq->sg, vi->big_packets_num_skbfrags + 2,
 				  first, gfp);
 	if (err < 0)