Message ID: 20210316041645.144249-1-arjunroy.kdev@gmail.com (mailing list archive)
State: New, archived
Series: [mm,net-next,v2] mm: net: memcg accounting for TCP rx zerocopy
On Mon, Mar 15, 2021 at 9:16 PM Arjun Roy <arjunroy.kdev@gmail.com> wrote: > > From: Arjun Roy <arjunroy@google.com> > > TCP zerocopy receive is used by high performance network applications > to further scale. For RX zerocopy, the memory containing the network > data filled by the network driver is directly mapped into the address > space of high performance applications. To keep the TLB cost low, > these applications unmap the network memory in big batches. So, this > memory can remain mapped for long time. This can cause a memory > isolation issue as this memory becomes unaccounted after getting > mapped into the application address space. This patch adds the memcg > accounting for such memory. > > Accounting the network memory comes with its own unique challenges. > The high performance NIC drivers use page pooling to reuse the pages > to eliminate/reduce expensive setup steps like IOMMU. These drivers > keep an extra reference on the pages and thus we can not depend on the > page reference for the uncharging. The page in the pool may keep a > memcg pinned for arbitrary long time or may get used by other memcg. > > This patch decouples the uncharging of the page from the refcnt and > associates it with the map count i.e. the page gets uncharged when the > last address space unmaps it. Now the question is, what if the driver > drops its reference while the page is still mapped? That is fine as > the address space also holds a reference to the page i.e. the > reference count can not drop to zero before the map count. > > Signed-off-by: Arjun Roy <arjunroy@google.com> > Co-developed-by: Shakeel Butt <shakeelb@google.com> > Signed-off-by: Shakeel Butt <shakeelb@google.com> > Signed-off-by: Eric Dumazet <edumazet@google.com> > Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com> > --- > > Changelog since v1: > - Pages accounted for in this manner are now tracked via MEMCG_SOCK. > - v1 allowed for a brief period of double-charging, now we have a > brief period of under-charging to avoid undue memory pressure. 
> > include/linux/memcontrol.h | 48 ++++++++++++- > mm/memcontrol.c | 138 +++++++++++++++++++++++++++++++++++++ > mm/rmap.c | 7 +- > net/ipv4/tcp.c | 79 +++++++++++++++++---- > 4 files changed, 253 insertions(+), 19 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index e6dc793d587d..d67bc2aec7f6 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -349,11 +349,13 @@ extern struct mem_cgroup *root_mem_cgroup; > > enum page_memcg_data_flags { > /* page->memcg_data is a pointer to an objcgs vector */ > - MEMCG_DATA_OBJCGS = (1UL << 0), > + MEMCG_DATA_OBJCGS = (1UL << 0), > /* page has been accounted as a non-slab kernel page */ > - MEMCG_DATA_KMEM = (1UL << 1), > + MEMCG_DATA_KMEM = (1UL << 1), > + /* page has been accounted as network memory */ > + MEMCG_DATA_SOCK = (1UL << 2), > /* the next bit after the last actual flag */ > - __NR_MEMCG_DATA_FLAGS = (1UL << 2), > + __NR_MEMCG_DATA_FLAGS = (1UL << 3), > }; > > #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) > @@ -444,6 +446,11 @@ static inline bool PageMemcgKmem(struct page *page) > return page->memcg_data & MEMCG_DATA_KMEM; > } > > +static inline bool PageMemcgSock(struct page *page) > +{ > + return page->memcg_data & MEMCG_DATA_SOCK; > +} > + > #ifdef CONFIG_MEMCG_KMEM > /* > * page_objcgs - get the object cgroups vector associated with a page > @@ -1093,6 +1100,11 @@ static inline bool PageMemcgKmem(struct page *page) > return false; > } > > +static inline bool PageMemcgSock(struct page *page) > +{ > + return false; > +} > + > static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) > { > return true; > @@ -1554,6 +1566,15 @@ extern struct static_key_false memcg_sockets_enabled_key; > #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) > void mem_cgroup_sk_alloc(struct sock *sk); > void mem_cgroup_sk_free(struct sock *sk); > + > +void mem_cgroup_post_charge_sock_pages(struct mem_cgroup *memcg, > + unsigned int nr_pages); > +void mem_cgroup_uncharge_sock_page(struct page *page); > +int mem_cgroup_prepare_sock_pages(struct mem_cgroup *memcg, struct page **pages, > + u8 *page_prepared, unsigned int nr_pages); > +int mem_cgroup_unprepare_sock_pages(struct mem_cgroup *memcg, struct page **pages, > + u8 *page_prepared, unsigned int nr_pages); > + > static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) > { > if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure) > @@ -1582,6 +1603,27 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, > int nid, int shrinker_id) > { > } > + > +static inline void mem_cgroup_post_charge_sock_pages(struct mem_cgroup *memcg, > + unsigned int nr_pages) > +{ > +} > + > +static inline void mem_cgroup_uncharge_sock_page(struct page *page) > +{ > +} > + > +static inline int mem_cgroup_prepare_sock_pages(struct mem_cgroup *memcg, struct page **pages, > + u8 *page_prepared, unsigned int nr_pages) > +{ > + return 0; > +} > + > +static inline int mem_cgroup_unprepare_sock_pages(struct mem_cgroup *memcg, struct page **pages, > + u8 *page_prepared, unsigned int nr_pages) > +{ > + return 0; > +} > #endif > > #ifdef CONFIG_MEMCG_KMEM > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 845eec01ef9d..aee126b0028c 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -7027,6 +7027,144 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) > refill_stock(memcg, nr_pages); > } > > +/** > + * 
mem_cgroup_post_charge_sock_pages - charge socket memory > + * for zerocopy pages mapped to userspace. > + * @memcg: memcg to charge > + * @nr_pages: number of pages > + * > + * When we perform a zero-copy receive on a socket, the receive payload memory > + * pages are remapped to the user address space with vm_insert_pages(). > + * mem_cgroup_post_charge_sock_pages() accounts for this memory utilization; it is > + * *not* mem_cgroup_charge_skmem() which accounts for memory use within socket > + * buffers. > + * > + * Charges all @nr_pages to given memcg. The caller should have a reference > + * on the given memcg. Unlike mem_cgroup_charge_skmem, the caller must also have > + * set page->memcg_data for these pages beforehand. Decoupling this page > + * manipulation from the charging allows us to avoid double-charging the pages, > + * causing undue memory pressure. > + */ > +void mem_cgroup_post_charge_sock_pages(struct mem_cgroup *memcg, > + unsigned int nr_pages) > +{ > + if (mem_cgroup_disabled() || !memcg) > + return; > + try_charge(memcg, GFP_KERNEL | __GFP_NOFAIL, nr_pages); > + mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); > +} > + > +/** > + * mem_cgroup_uncharge_sock_page - uncharge socket page > + * when unmapping zerocopy pages mapped to userspace. > + * @page: page to uncharge > + * > + * mem_cgroup_uncharge_sock_page() drops the CSS reference taken by > + * mem_cgroup_prepare_sock_pages(), and uncharges memory usage. > + * Page cannot be compound. Must be called with page memcg lock held. > + */ > +void mem_cgroup_uncharge_sock_page(struct page *page) > +{ > + struct mem_cgroup *memcg; > + > + VM_BUG_ON(PageCompound(page)); > + memcg = page_memcg(page); > + if (!memcg) > + return; > + > + mod_memcg_state(memcg, MEMCG_SOCK, -1); > + refill_stock(memcg, 1); > + css_put(&memcg->css); > + page->memcg_data = 0; > +} > + > +/** > + * mem_cgroup_unprepare_sock_pages - unset memcg for unremapped zerocopy pages. > + * @memcg: The memcg we were trying to account pages to. > + * @pages: array of pages, some subset of which we must 'unprepare' > + * @page_prepared: array of flags indicating if page must be unprepared > + * @nr_pages: Length of pages and page_prepared arrays. > + * > + * If a zerocopy receive attempt failed to remap some pages to userspace, we > + * must unset memcg on these pages, if we had previously set them with a > + * matching call to mem_cgroup_prepare_sock_pages(). > + * > + * The caller must ensure that all input parameters are the same parameters > + * (or a subset of the parameters) passed to the preceding call to > + * mem_cgroup_prepare_sock_pages() otherwise undefined behaviour results. > + * Returns how many pages were unprepared so caller post-charges the right > + * amount of pages to the memcg. > + */ > +int mem_cgroup_unprepare_sock_pages(struct mem_cgroup *memcg, struct page **pages, > + u8 *page_prepared, unsigned int nr_pages) > +{ > + unsigned int idx, cleared = 0; > + > + if (!memcg || mem_cgroup_disabled()) > + return 0; > + > + for (idx = 0; idx < nr_pages; ++idx) { > + if (!page_prepared[idx]) > + continue; > + /* We prepared this page so it is not LRU. */ > + WRITE_ONCE(pages[idx]->memcg_data, 0); > + ++cleared; > + } > + css_put_many(&memcg->css, cleared); > + return cleared; > +} > + > +/** > + * mem_cgroup_prepare_sock_pages - set memcg for receive zerocopy pages. > + * @memcg: The memcg we were trying to account pages to. 
> + * @pages: array of pages, some subset of which we must 'prepare' > + * @page_prepared: array of flags indicating if page was prepared > + * @nr_pages: Length of pages and page_prepared arrays. > + * > + * If we are to remap pages to userspace in zerocopy receive, we must set the > + * memcg for these pages before vm_insert_pages(), if the page does not already > + * have a memcg set. However, if a memcg was already set, we do not adjust it. > + * Explicitly track which pages we have prepared ourselves so we can 'unprepare' > + * them if need be should page remapping fail. > + * > + * The caller must ensure that all input parameter passed to a subsequent call > + * to mem_cgroup_unprepare_sock_pages() are identical or a subset of these > + * parameters otherwise undefined behaviour results. page_prepared must also be > + * zero'd by the caller before invoking this method. Returns how many pages were > + * prepared so caller post-charges the right amount of pages to the memcg. > + */ > +int mem_cgroup_prepare_sock_pages(struct mem_cgroup *memcg, struct page **pages, > + u8 *page_prepared, unsigned int nr_pages) > +{ > + unsigned int idx, to_uncharge = 0; > + const unsigned long memcg_data = (unsigned long) memcg | > + MEMCG_DATA_SOCK; > + > + if (!memcg || mem_cgroup_disabled()) > + return 0; > + > + css_get_many(&memcg->css, nr_pages); > + for (idx = 0; idx < nr_pages; ++idx) { > + struct page *page = pages[idx]; > + > + VM_BUG_ON(PageCompound(page)); > + /* > + * page->memcg_data == 0 implies non-LRU page. non-LRU pages are > + * not changed by the rest of the kernel, so we only have to > + * guard against concurrent access in the networking layer. > + * cmpxchg(0) lets us both validate non-LRU and protects against > + * concurrent access in networking layer. > + */ > + if (cmpxchg(&page->memcg_data, 0, memcg_data) == 0) > + page_prepared[idx] = 1; > + else > + ++to_uncharge; > + } > + if (to_uncharge) > + css_put_many(&memcg->css, to_uncharge); > + return nr_pages - to_uncharge; > +} > + > static int __init cgroup_memory(char *s) > { > char *token; > diff --git a/mm/rmap.c b/mm/rmap.c > index b0fc27e77d6d..d2a164769dcf 100644 > --- a/mm/rmap.c > +++ b/mm/rmap.c > @@ -1290,6 +1290,9 @@ static void page_remove_file_rmap(struct page *page, bool compound) > > if (unlikely(PageMlocked(page))) > clear_page_mlock(page); > + > + if (unlikely(PageMemcgSock(page))) > + mem_cgroup_uncharge_sock_page(page); > } > > static void page_remove_anon_compound_rmap(struct page *page) > @@ -1345,7 +1348,7 @@ static void page_remove_anon_compound_rmap(struct page *page) > */ > void page_remove_rmap(struct page *page, bool compound) > { > - lock_page_memcg(page); > + struct mem_cgroup *memcg = lock_page_memcg(page); > > if (!PageAnon(page)) { > page_remove_file_rmap(page, compound); > @@ -1384,7 +1387,7 @@ void page_remove_rmap(struct page *page, bool compound) > * faster for those pages still in swapcache. 
> */ > out: > - unlock_page_memcg(page); > + __unlock_page_memcg(memcg); > } > > /* > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c > index de7cc8445ac0..17dd5b57409f 100644 > --- a/net/ipv4/tcp.c > +++ b/net/ipv4/tcp.c > @@ -1789,12 +1789,14 @@ static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, > ++frag; > } > *offset_frag = 0; > + prefetchw(skb_frag_page(frag)); > return frag; > } > > static bool can_map_frag(const skb_frag_t *frag) > { > - return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag); > + return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag) && > + !PageCompound(skb_frag_page(frag)); > } > > static int find_next_mappable_frag(const skb_frag_t *frag, > @@ -1944,14 +1946,19 @@ static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, > > static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, > struct page **pending_pages, > - unsigned long pages_remaining, > + u8 *page_prepared, > + unsigned long leftover_pages, > unsigned long *address, > u32 *length, > u32 *seq, > struct tcp_zerocopy_receive *zc, > u32 total_bytes_to_map, > - int err) > + int err, > + unsigned long *pages_acctd_total, > + struct mem_cgroup *memcg) > { > + unsigned long pages_remaining = leftover_pages, pages_mapped = 0; > + > /* At least one page did not map. Try zapping if we skipped earlier. */ > if (err == -EBUSY && > zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) { > @@ -1965,14 +1972,14 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, > } > > if (!err) { > - unsigned long leftover_pages = pages_remaining; > int bytes_mapped; > > /* We called zap_page_range, try to reinsert. */ > err = vm_insert_pages(vma, *address, > pending_pages, > &pages_remaining); > - bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining); > + pages_mapped = leftover_pages - pages_remaining; > + bytes_mapped = PAGE_SIZE * pages_mapped; > *seq += bytes_mapped; > *address += bytes_mapped; > } > @@ -1986,24 +1993,41 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, > > *length -= bytes_not_mapped; > zc->recv_skip_hint += bytes_not_mapped; > + > + pending_pages += pages_mapped; > + page_prepared += pages_mapped; > + > + /* For the pages that did not map, unset memcg and drop refs. */ > + *pages_acctd_total -= mem_cgroup_unprepare_sock_pages(memcg, > + pending_pages, > + page_prepared, > + pages_remaining); > } > return err; > } > > static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, > struct page **pages, > + u8 *page_prepared, > unsigned int pages_to_map, > unsigned long *address, > u32 *length, > u32 *seq, > struct tcp_zerocopy_receive *zc, > - u32 total_bytes_to_map) > + u32 total_bytes_to_map, > + unsigned long *pages_acctd_total, > + struct mem_cgroup *memcg) > { > unsigned long pages_remaining = pages_to_map; > unsigned int pages_mapped; > unsigned int bytes_mapped; > int err; > > + /* Before mapping, we must take css ref since unmap drops it. */ > + *pages_acctd_total = mem_cgroup_prepare_sock_pages(memcg, pages, > + page_prepared, > + pages_to_map); > + > err = vm_insert_pages(vma, *address, pages, &pages_remaining); > pages_mapped = pages_to_map - (unsigned int)pages_remaining; > bytes_mapped = PAGE_SIZE * pages_mapped; > @@ -2018,8 +2042,9 @@ static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, > > /* Error: maybe zap and retry + rollback state for failed inserts. 
*/ > return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped, > + page_prepared + pages_mapped, > pages_remaining, address, length, seq, zc, total_bytes_to_map, > - err); > + err, pages_acctd_total, memcg); > } > > #define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) > @@ -2058,6 +2083,8 @@ static int tcp_zerocopy_receive(struct sock *sk, > u32 length = 0, offset, vma_len, avail_len, copylen = 0; > unsigned long address = (unsigned long)zc->address; > struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; > + u8 page_prepared[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; > + unsigned long total_pages_acctd = 0; > s32 copybuf_len = zc->copybuf_len; > struct tcp_sock *tp = tcp_sk(sk); > const skb_frag_t *frags = NULL; > @@ -2065,6 +2092,7 @@ static int tcp_zerocopy_receive(struct sock *sk, > struct vm_area_struct *vma; > struct sk_buff *skb = NULL; > u32 seq = tp->copied_seq; > + struct mem_cgroup *memcg; > u32 total_bytes_to_map; > int inq = tcp_inq(sk); > int ret; > @@ -2110,12 +2138,15 @@ static int tcp_zerocopy_receive(struct sock *sk, > zc->length = avail_len; > zc->recv_skip_hint = avail_len; > } > + > + memcg = get_mem_cgroup_from_mm(current->mm); > ret = 0; > while (length + PAGE_SIZE <= zc->length) { > + bool skb_remainder_unmappable = zc->recv_skip_hint < PAGE_SIZE; > int mappable_offset; > struct page *page; > > - if (zc->recv_skip_hint < PAGE_SIZE) { > + if (skb_remainder_unmappable) { > u32 offset_frag; > > if (skb) { > @@ -2144,30 +2175,46 @@ static int tcp_zerocopy_receive(struct sock *sk, > break; > } > page = skb_frag_page(frags); > - prefetchw(page); > pages[pages_to_map++] = page; > length += PAGE_SIZE; > zc->recv_skip_hint -= PAGE_SIZE; > frags++; > + skb_remainder_unmappable = zc->recv_skip_hint < PAGE_SIZE; > if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE || > - zc->recv_skip_hint < PAGE_SIZE) { > + skb_remainder_unmappable) { > + unsigned long pages_acctd = 0; > + > /* Either full batch, or we're about to go to next skb > * (and we cannot unroll failed ops across skbs). > */ > + memset(page_prepared, 0, sizeof(page_prepared)); > ret = tcp_zerocopy_vm_insert_batch(vma, pages, > + page_prepared, > pages_to_map, > &address, &length, > &seq, zc, > - total_bytes_to_map); > + total_bytes_to_map, > + &pages_acctd, > + memcg); > + total_pages_acctd += pages_acctd; > if (ret) > goto out; > pages_to_map = 0; > } > + if (!skb_remainder_unmappable) > + prefetchw(skb_frag_page(frags)); > } > if (pages_to_map) { > - ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, > - &address, &length, &seq, > - zc, total_bytes_to_map); > + unsigned long pages_acctd = 0; > + > + memset(page_prepared, 0, sizeof(page_prepared)); > + ret = tcp_zerocopy_vm_insert_batch(vma, pages, > + page_prepared, > + pages_to_map, &address, > + &length, &seq, zc, > + total_bytes_to_map, > + &pages_acctd, memcg); > + total_pages_acctd += pages_acctd; > } > out: > mmap_read_unlock(current->mm); > @@ -2190,6 +2237,10 @@ static int tcp_zerocopy_receive(struct sock *sk, > ret = -EIO; > } > zc->length = length; > + > + /* Account pages to user. */ > + mem_cgroup_post_charge_sock_pages(memcg, total_pages_acctd); > + mem_cgroup_put(memcg); > return ret; > } > #endif > -- > 2.31.0.rc2.261.g7f71774620-goog > Apologies for the spam - looks like I forgot to rebase the first time I sent this out. 
Actually, on a related note, it's not 100% clear to me whether this patch (which in its current form, applies to net-next) should instead be based on the mm branch - but the most recent (clean) checkout of mm fails to build for me so net-next for now. -Arjun
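For readers skimming the thread, the charging flow this v2 implements condenses to roughly the sketch below. It is a simplification of the diff above: the skb walk, the batching details, and the zap-and-retry error path are elided, and zc_insert_batch_sketch() is an illustrative name rather than a function in the patch.

    /* One batch of zerocopy receive pages (simplified; the caller zeroes
     * @prepared before each batch). */
    static int zc_insert_batch_sketch(struct vm_area_struct *vma,
                                      unsigned long addr, struct page **pages,
                                      u8 *prepared, unsigned int nr,
                                      struct mem_cgroup *memcg,
                                      unsigned long *acctd)
    {
            unsigned long remaining = nr;
            int err;

            /* Tag the pages with the memcg and take css refs; pages that
             * already carry a memcg are skipped and not counted. */
            *acctd = mem_cgroup_prepare_sock_pages(memcg, pages, prepared, nr);

            err = vm_insert_pages(vma, addr, pages, &remaining);

            /* Roll back the tagging for pages that did not make it in. */
            if (remaining)
                    *acctd -= mem_cgroup_unprepare_sock_pages(memcg,
                                    pages + (nr - remaining),
                                    prepared + (nr - remaining),
                                    remaining);
            return err;
    }

    /* After the receive loop: charge only what actually got mapped. The
     * charge is dropped again via page_remove_rmap() ->
     * mem_cgroup_uncharge_sock_page() when the last user mapping goes away. */
    mem_cgroup_post_charge_sock_pages(memcg, total_pages_acctd);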
On Mon, Mar 15, 2021 at 9:20 PM Arjun Roy <arjunroy@google.com> wrote: > [...] > > > > Apologies for the spam - looks like I forgot to rebase the first time > I sent this out. > > Actually, on a related note, it's not 100% clear to me whether this > patch (which in its current form, applies to net-next) should instead > be based on the mm branch - but the most recent (clean) checkout of mm > fails to build for me so net-next for now. > It is due to "mm: page-writeback: simplify memcg handling in test_clear_page_writeback()" patch in the mm tree. You would need to reintroduce the lock_page_memcg() which returns the memcg and make __unlock_page_memcg() non-static.
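Concretely, rebasing onto the mm tree as Shakeel describes would mean restoring an interface shape along these lines (a sketch of the pre-simplification signatures, not the actual mm-tree change):

    /* include/linux/memcontrol.h (sketch) */
    struct mem_cgroup *lock_page_memcg(struct page *page);
    void __unlock_page_memcg(struct mem_cgroup *memcg);  /* no longer static */
    void unlock_page_memcg(struct page *page);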
On Mon, Mar 15, 2021 at 9:29 PM Shakeel Butt <shakeelb@google.com> wrote: > > On Mon, Mar 15, 2021 at 9:20 PM Arjun Roy <arjunroy@google.com> wrote: > > > [...] > > > > > > > Apologies for the spam - looks like I forgot to rebase the first time > > I sent this out. > > > > Actually, on a related note, it's not 100% clear to me whether this > > patch (which in its current form, applies to net-next) should instead > > be based on the mm branch - but the most recent (clean) checkout of mm > > fails to build for me so net-next for now. > > > > It is due to "mm: page-writeback: simplify memcg handling in > test_clear_page_writeback()" patch in the mm tree. You would need to > reintroduce the lock_page_memcg() which returns the memcg and make > __unlock_page_memcg() non-static. To clarify, Shakeel - the tag "tag: v5.12-rc2-mmots-2021-03-11-21-49" fails to build on a clean checkout, without this patch, due to a compilation failure in mm/shmem.c, for reference: https://pastebin.com/raw/12eSGdGD (and that's why I'm basing this patch off of net-next in this email). -Arjun
On Mon, Mar 15, 2021 at 11:22 PM Arjun Roy <arjunroy@google.com> wrote: > > On Mon, Mar 15, 2021 at 9:29 PM Shakeel Butt <shakeelb@google.com> wrote: > > > > On Mon, Mar 15, 2021 at 9:20 PM Arjun Roy <arjunroy@google.com> wrote: > > > > > [...] > > > > > > > > > > Apologies for the spam - looks like I forgot to rebase the first time > > > I sent this out. > > > > > > Actually, on a related note, it's not 100% clear to me whether this > > > patch (which in its current form, applies to net-next) should instead > > > be based on the mm branch - but the most recent (clean) checkout of mm > > > fails to build for me so net-next for now. > > > > > > > It is due to "mm: page-writeback: simplify memcg handling in > > test_clear_page_writeback()" patch in the mm tree. You would need to > > reintroduce the lock_page_memcg() which returns the memcg and make > > __unlock_page_memcg() non-static. > > To clarify, Shakeel - the tag "tag: v5.12-rc2-mmots-2021-03-11-21-49" > fails to build on a clean checkout, without this patch, due to a > compilation failure in mm/shmem.c, for reference: > https://pastebin.com/raw/12eSGdGD > (and that's why I'm basing this patch off of net-next in this email). > > -Arjun Another seeming anomaly - the patch sent out passes scripts/checkpatch.pl but netdev/checkpatch finds plenty of actionable fixes here: https://patchwork.kernel.org/project/netdevbpf/patch/20210316041645.144249-1-arjunroy.kdev@gmail.com/ Is netdev using some other automated checker instead of scripts/checkpatch.pl? -Arjun
Hello, On Mon, Mar 15, 2021 at 09:16:45PM -0700, Arjun Roy wrote: > From: Arjun Roy <arjunroy@google.com> > > TCP zerocopy receive is used by high performance network applications > to further scale. For RX zerocopy, the memory containing the network > data filled by the network driver is directly mapped into the address > space of high performance applications. To keep the TLB cost low, > these applications unmap the network memory in big batches. So, this > memory can remain mapped for long time. This can cause a memory > isolation issue as this memory becomes unaccounted after getting > mapped into the application address space. This patch adds the memcg > accounting for such memory. > > Accounting the network memory comes with its own unique challenges. > The high performance NIC drivers use page pooling to reuse the pages > to eliminate/reduce expensive setup steps like IOMMU. These drivers > keep an extra reference on the pages and thus we can not depend on the > page reference for the uncharging. The page in the pool may keep a > memcg pinned for arbitrary long time or may get used by other memcg. The page pool knows when a page is unmapped again and becomes available for recycling, right? Essentially the 'free' phase of that private allocator. That's where the uncharge should be done. For one, it's more aligned with the usual memcg charge lifetime rules. But also it doesn't add what is essentially a private driver callback to the generic file unmapping path. Finally, this will eliminate the need for making up a new charge type (MEMCG_DATA_SOCK) and allow using the standard kmem charging API. > This patch decouples the uncharging of the page from the refcnt and > associates it with the map count i.e. the page gets uncharged when the > last address space unmaps it. Now the question is, what if the driver > drops its reference while the page is still mapped? That is fine as > the address space also holds a reference to the page i.e. the > reference count can not drop to zero before the map count. > > Signed-off-by: Arjun Roy <arjunroy@google.com> > Co-developed-by: Shakeel Butt <shakeelb@google.com> > Signed-off-by: Shakeel Butt <shakeelb@google.com> > Signed-off-by: Eric Dumazet <edumazet@google.com> > Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com> > --- > > Changelog since v1: > - Pages accounted for in this manner are now tracked via MEMCG_SOCK. > - v1 allowed for a brief period of double-charging, now we have a > brief period of under-charging to avoid undue memory pressure. I'm afraid we'll have to go back to v1. Let's address the issues raised with it: 1. The NR_FILE_MAPPED accounting. It is longstanding Linux behavior that driver pages mapped into userspace are accounted as file pages, because userspace is actually doing mmap() against a driver file/fd (as opposed to an anon mmap). That is how they show up in vmstat, in meminfo, and in the per process stats. There is no reason to make memcg deviate from this. If we don't like it, it should be taken on by changing vm_insert_page() - not trick rmap into thinking these arent memcg pages and then fixing it up with additional special-cased accounting callbacks. v1 did this right, it charged the pages the way we handle all other userspace pages: before rmap, and then let the generic VM code do the accounting for us with the cgroup-aware vmstat infrastructure. 2. The double charging. Could you elaborate how much we're talking about in any given batch? Is this a problem worth worrying about? 
The way I see it, any conflict here is caused by the pages being counted in the SOCK counter already, but not actually *tracked* on a per page basis. If it's worth addressing, we should look into fixing the root cause over there first if possible, before trying to work around it here. The newly-added GFP_NOFAIL is especially worrisome. The pages should be charged before we make promises to userspace, not be force-charged when it's too late. We have sk context when charging the inserted pages. Can we uncharge MEMCG_SOCK after each batch of inserts? That's only 32 pages worth of overcharging, so not more than the regular charge batch memcg is using. An even better way would be to do charge stealing where we reuse the existing MEMCG_SOCK charges and don't have to get any new ones at all - just set up page->memcg and remove the charge from the sk. But yeah, it depends a bit if this is a practical concern. Thanks, Johannes
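One way to read the per-batch suggestion, as a very rough sketch: right after a successful insert batch in tcp_zerocopy_receive(), drop the corresponding MEMCG_SOCK charge using the existing mem_cgroup_uncharge_skmem() helper. The exact placement and the interaction with the sk/skb-side bookkeeping (which Arjun raises below) are open questions, so treat this as an illustration of the idea only.

    /* The pages in this batch are now charged as user-mapped memory, so
     * release the same number of pages from the MEMCG_SOCK side; any
     * double charge is then bounded by one batch
     * (TCP_ZEROCOPY_PAGE_BATCH_SIZE pages) rather than by skb lifetime. */
    if (pages_acctd)
            mem_cgroup_uncharge_skmem(memcg, pages_acctd);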
On Mon, 15 Mar 2021 23:28:08 -0700 Arjun Roy wrote: > On Mon, Mar 15, 2021 at 11:22 PM Arjun Roy <arjunroy@google.com> wrote: > > > > On Mon, Mar 15, 2021 at 9:29 PM Shakeel Butt <shakeelb@google.com> wrote: > > > > > > On Mon, Mar 15, 2021 at 9:20 PM Arjun Roy <arjunroy@google.com> wrote: > [...] > > > [...] > [...] > [...] > > > > > > It is due to "mm: page-writeback: simplify memcg handling in > > > test_clear_page_writeback()" patch in the mm tree. You would need to > > > reintroduce the lock_page_memcg() which returns the memcg and make > > > __unlock_page_memcg() non-static. > > > > To clarify, Shakeel - the tag "tag: v5.12-rc2-mmots-2021-03-11-21-49" > > fails to build on a clean checkout, without this patch, due to a > > compilation failure in mm/shmem.c, for reference: > > https://pastebin.com/raw/12eSGdGD > > (and that's why I'm basing this patch off of net-next in this email). > > > > -Arjun > > Another seeming anomaly - the patch sent out passes > scripts/checkpatch.pl but netdev/checkpatch finds plenty of actionable > fixes here: https://patchwork.kernel.org/project/netdevbpf/patch/20210316041645.144249-1-arjunroy.kdev@gmail.com/ > > Is netdev using some other automated checker instead of scripts/checkpatch.pl? --strict --max-line-length=80
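So reproducing the netdev result locally amounts to running checkpatch with those two extra flags against the patch file, for example (the file name here is only a placeholder):

    ./scripts/checkpatch.pl --strict --max-line-length=80 \
            0001-mm-net-memcg-accounting-for-TCP-rx-zerocopy.patch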
On Tue, Mar 16, 2021 at 3:27 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > Hello, > > On Mon, Mar 15, 2021 at 09:16:45PM -0700, Arjun Roy wrote: > > From: Arjun Roy <arjunroy@google.com> > > > > TCP zerocopy receive is used by high performance network applications > > to further scale. For RX zerocopy, the memory containing the network > > data filled by the network driver is directly mapped into the address > > space of high performance applications. To keep the TLB cost low, > > these applications unmap the network memory in big batches. So, this > > memory can remain mapped for long time. This can cause a memory > > isolation issue as this memory becomes unaccounted after getting > > mapped into the application address space. This patch adds the memcg > > accounting for such memory. > > > > Accounting the network memory comes with its own unique challenges. > > The high performance NIC drivers use page pooling to reuse the pages > > to eliminate/reduce expensive setup steps like IOMMU. These drivers > > keep an extra reference on the pages and thus we can not depend on the > > page reference for the uncharging. The page in the pool may keep a > > memcg pinned for arbitrary long time or may get used by other memcg. > > The page pool knows when a page is unmapped again and becomes > available for recycling, right? Essentially the 'free' phase of that > private allocator. That's where the uncharge should be done. > In general, no it does not. The page pool, how it operates and whether it exists in the first place, is an optimization that a given NIC driver can choose to make - and so there's no generic plumbing that ties page unmap events to something that a page pool could subscribe to that I am aware of. All it can do is check, at a given point, whether it can reuse a page or not, typically by checking the current page refcount. A couple of examples for drivers with such a mechanism - mlx5: (https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L248) Or intel fm10k: (https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/intel/fm10k/fm10k_main.c#L207) Note that typically map count is not checked (maybe because page-flipping receive zerocopy did not exist as a consideration when the driver was written). So given that the page pool is essentially checking on demand for whether a page is usable or not - since there is no specific plumbing invoked when a page is usable again (i.e. unmapped, in this case) - we opted to hook into when the mapcount is decremented inside unmap() path. > For one, it's more aligned with the usual memcg charge lifetime rules. > > But also it doesn't add what is essentially a private driver callback > to the generic file unmapping path. > I understand the concern, and share it - the specific thing we'd like to avoid is to have driver specific code in the unmap path, and not in the least because any given driver could do its own thing. Rather, we consider this mechanism that we added as generic to zerocopy network data reception - that it does the right thing, no matter what the driver is doing. This would be transparent to the driver, in other words - all the driver has to do is to continue doing what it was before, using page->refcnt == 1 to decide whether it can use a page or if it is not already in use. Consider this instead as a broadly applicable networking feature adding a callback to the unmap path, instead of a particular driver. 
And while it is just TCP at present, it fundamentally isn't limited to TCP. I do have a request for clarification, if you could specify the usual memcg charge lifetime rules that you are referring to here? Just to make sure we're on the same page. > Finally, this will eliminate the need for making up a new charge type > (MEMCG_DATA_SOCK) and allow using the standard kmem charging API. > > > This patch decouples the uncharging of the page from the refcnt and > > associates it with the map count i.e. the page gets uncharged when the > > last address space unmaps it. Now the question is, what if the driver > > drops its reference while the page is still mapped? That is fine as > > the address space also holds a reference to the page i.e. the > > reference count can not drop to zero before the map count. > > > > Signed-off-by: Arjun Roy <arjunroy@google.com> > > Co-developed-by: Shakeel Butt <shakeelb@google.com> > > Signed-off-by: Shakeel Butt <shakeelb@google.com> > > Signed-off-by: Eric Dumazet <edumazet@google.com> > > Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com> > > --- > > > > Changelog since v1: > > - Pages accounted for in this manner are now tracked via MEMCG_SOCK. > > - v1 allowed for a brief period of double-charging, now we have a > > brief period of under-charging to avoid undue memory pressure. > > I'm afraid we'll have to go back to v1. > > Let's address the issues raised with it: > > 1. The NR_FILE_MAPPED accounting. It is longstanding Linux behavior > that driver pages mapped into userspace are accounted as file > pages, because userspace is actually doing mmap() against a driver > file/fd (as opposed to an anon mmap). That is how they show up in > vmstat, in meminfo, and in the per process stats. There is no > reason to make memcg deviate from this. If we don't like it, it > should be taken on by changing vm_insert_page() - not trick rmap > into thinking these arent memcg pages and then fixing it up with > additional special-cased accounting callbacks. > > v1 did this right, it charged the pages the way we handle all other > userspace pages: before rmap, and then let the generic VM code do > the accounting for us with the cgroup-aware vmstat infrastructure. To clarify, are you referring to the v1 approach for this patch from a few weeks ago? (i.e. charging for the page before vm_insert_page()). This patch changes when the charging happens, and, as you note, makes it a forced charge since we've already inserted the mappings into the user's address space - but it isn't otherwise fundamentally different from v1 in what it does. And unmap is the same. > > 2. The double charging. Could you elaborate how much we're talking > about in any given batch? Is this a problem worth worrying about? > The period of double counting in v1 of this patch was from around the time we do vm_insert_page() (note that the pages were accounted just prior to being inserted) till the struct sk_buff's were disposed of - for an skb that's up to 45 pages. But note that is for one socket, and there can be quite a lot of open sockets and depending on what happens in terms of scheduling the period of time we're double counting can be a bit high. v1 patch series had some discussion on this: https://www.spinics.net/lists/cgroups/msg27665.html which is why we have post-charging in v2. > > The way I see it, any conflict here is caused by the pages being > counted in the SOCK counter already, but not actually *tracked* on > a per page basis. 
If it's worth addressing, we should look into > fixing the root cause over there first if possible, before trying > to work around it here. > When you say tracked on a per-page basis, I assume you mean using the usual mechanism where a page has a non-null memcg set, with unaccounting occuring when the refcount goes to 0. Networking currently will account/unaccount bytes just based on a page count (and the memcg set in struct sock) rather than setting it in the page itself - because the recycling of pages means the next time around it could be charged to another memcg. And the refcount never goes to 1 due to the pooling (in the absence of eviction for busy pages during packet reception). When sitting in the driver page pool, non-null memcg does not work since we do not know which socket (thus which memcg) the page would be destined for since we do not know whose packet arrives next. The page pooling does make this all this a bit awkward, and the approach in this patch seems to me the cleanest solution. > > > The newly-added GFP_NOFAIL is especially worrisome. The pages > should be charged before we make promises to userspace, not be > force-charged when it's too late. > > We have sk context when charging the inserted pages. Can we > uncharge MEMCG_SOCK after each batch of inserts? That's only 32 > pages worth of overcharging, so not more than the regular charge > batch memcg is using. Right now sock uncharging is happening as we cleanup the skb, which can have up to 45 pages. So if we tried uncharging per SKB we'd then have up to 45 pages worth of overcharging per socket, multiplied by however many sockets are currently being overcharged in this manner. > > An even better way would be to do charge stealing where we reuse > the existing MEMCG_SOCK charges and don't have to get any new ones > at all - just set up page->memcg and remove the charge from the sk. > That gets a bit complicated since we have to be careful with forward allocs in the socket layer - and the bookkeeping gets a bit complicated since now for the struct sock we'll need to track how many bytes were 'stolen' and update all the code where socket accounting happens. It ends up being a larger/messier code change then. Thanks, -Arjun > > > > But yeah, it depends a bit if this is a practical concern. > > Thanks, > Johannes
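For context, the on-demand recycling test Arjun describes boils down to a pattern like the following (a paraphrase of the mlx5/fm10k-style checks linked above, not the actual driver code; note that only the refcount is consulted, never the mapcount):

    /* Sketch of a driver RX page-pool recycle check. The pool holds one
     * base reference, so refcount == 1 means nobody else is using the page
     * and it can be given back to the NIC for the next packet. A page that
     * is still mapped into userspace holds an extra reference, so it fails
     * this test and is not recycled. */
    static bool rx_page_is_reusable(struct page *page)
    {
            return page_ref_count(page) == 1 && !page_is_pfmemalloc(page);
    }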
On Tue, Mar 16, 2021 at 11:05:11PM -0700, Arjun Roy wrote: > On Tue, Mar 16, 2021 at 3:27 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > Hello, > > > > On Mon, Mar 15, 2021 at 09:16:45PM -0700, Arjun Roy wrote: > > > From: Arjun Roy <arjunroy@google.com> > > > > > > TCP zerocopy receive is used by high performance network applications > > > to further scale. For RX zerocopy, the memory containing the network > > > data filled by the network driver is directly mapped into the address > > > space of high performance applications. To keep the TLB cost low, > > > these applications unmap the network memory in big batches. So, this > > > memory can remain mapped for long time. This can cause a memory > > > isolation issue as this memory becomes unaccounted after getting > > > mapped into the application address space. This patch adds the memcg > > > accounting for such memory. > > > > > > Accounting the network memory comes with its own unique challenges. > > > The high performance NIC drivers use page pooling to reuse the pages > > > to eliminate/reduce expensive setup steps like IOMMU. These drivers > > > keep an extra reference on the pages and thus we can not depend on the > > > page reference for the uncharging. The page in the pool may keep a > > > memcg pinned for arbitrary long time or may get used by other memcg. > > > > The page pool knows when a page is unmapped again and becomes > > available for recycling, right? Essentially the 'free' phase of that > > private allocator. That's where the uncharge should be done. > > > > In general, no it does not. The page pool, how it operates and whether it > exists in the first place, is an optimization that a given NIC driver can choose > to make - and so there's no generic plumbing that ties page unmap events to > something that a page pool could subscribe to that I am aware of. All it can do > is check, at a given point, whether it can reuse a page or not, typically by > checking the current page refcount. > > A couple of examples for drivers with such a mechanism - mlx5: > (https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L248) if (page_ref_count(cache->page_cache[cache->head].page) != 1) So IIUC it co-opts the page count used by the page allocator, offset by the base ref of the pool. That means it doesn't control the put path and won't be aware of when pages are used and unused. How does it free those pages back to the system eventually? Does it just do a series of put_page() on pages in the pool when something determines it is too big? However it does it, it found some way to live without a destructor. But now we need one. > Or intel fm10k: > (https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/intel/fm10k/fm10k_main.c#L207) > > Note that typically map count is not checked (maybe because page-flipping > receive zerocopy did not exist as a consideration when the driver was written). > > So given that the page pool is essentially checking on demand for whether a page > is usable or not - since there is no specific plumbing invoked when a page is > usable again (i.e. unmapped, in this case) - we opted to hook into when the > mapcount is decremented inside unmap() path. The problem is that the page isn't reusable just because it's unmapped. The user may have vmspliced those pages into a pipe and be pinning them long after the unmap. I don't know whether that happens in real life, but it's a legitimate thing to do. At the very least it would be an avenue for abuse. 
> > For one, it's more aligned with the usual memcg charge lifetime rules. > > > > But also it doesn't add what is essentially a private driver callback > > to the generic file unmapping path. > > > > I understand the concern, and share it - the specific thing we'd like to avoid > is to have driver specific code in the unmap path, and not in the least because > any given driver could do its own thing. > > Rather, we consider this mechanism that we added as generic to zerocopy network > data reception - that it does the right thing, no matter what the driver is > doing. This would be transparent to the driver, in other words - all the driver > has to do is to continue doing what it was before, using page->refcnt == 1 to > decide whether it can use a page or if it is not already in use. > > > Consider this instead as a broadly applicable networking feature adding a > callback to the unmap path, instead of a particular driver. And while it is just > TCP at present, it fundamentally isn't limited to TCP. > > I do have a request for clarification, if you could specify the usual memcg > charge lifetime rules that you are referring to here? Just to make sure we're on > the same page. Usually, the charge lifetime is tied to the allocation lifetime. It ends when the object is fed back to whatever pool it came out of, its data is considered invalid, and its memory is available to new allocs. This isn't the case here, and as per above you may uncharge the page long before the user actually relinquishes it. > > 1. The NR_FILE_MAPPED accounting. It is longstanding Linux behavior > > that driver pages mapped into userspace are accounted as file > > pages, because userspace is actually doing mmap() against a driver > > file/fd (as opposed to an anon mmap). That is how they show up in > > vmstat, in meminfo, and in the per process stats. There is no > > reason to make memcg deviate from this. If we don't like it, it > > should be taken on by changing vm_insert_page() - not trick rmap > > into thinking these arent memcg pages and then fixing it up with > > additional special-cased accounting callbacks. > > > > v1 did this right, it charged the pages the way we handle all other > > userspace pages: before rmap, and then let the generic VM code do > > the accounting for us with the cgroup-aware vmstat infrastructure. > > To clarify, are you referring to the v1 approach for this patch from a > few weeks ago? Yes. > (i.e. charging for the page before vm_insert_page()). This patch changes when > the charging happens, and, as you note, makes it a forced charge since we've > already inserted the mappings into the user's address space - but it isn't > otherwise fundamentally different from v1 in what it does. And unmap is the > same. Right. The places where it IS different are the problem ;) Working around native VM accounting; adding custom accounting that is inconsistent with the rest of the system; force-charging a quantity of memory that the container may not be entitled to. Please revert back to precharging like in v1. > The period of double counting in v1 of this patch was from around the time we do > vm_insert_page() (note that the pages were accounted just prior to being > inserted) till the struct sk_buff's were disposed of - for an skb > that's up to 45 pages. That's seems fine. Can there be instances where the buffers are pinned by something else for indefinite lengths of time? 
> But note that is for one socket, and there can be quite a lot of open > sockets and > depending on what happens in terms of scheduling the period of time we're > double counting can be a bit high. You mean thread/CPU scheduling? If it comes down to that, I'm not convinced this is a practical concern at this point. I think we can assume that the number of sockets and the number of concurrent threads going through the receive path at any given time will scale with the size of the cgroup. And even a thousand threads reading from a thousand sockets at exactly the same time would double charge a maximum of 175M for a brief period of time. Since few people have 1,000 CPUs the possible overlap is further reduced. This isn't going to be a problem in the real world. > > The way I see it, any conflict here is caused by the pages being > > counted in the SOCK counter already, but not actually *tracked* on > > a per page basis. If it's worth addressing, we should look into > > fixing the root cause over there first if possible, before trying > > to work around it here. > > > > When you say tracked on a per-page basis, I assume you mean using the usual > mechanism where a page has a non-null memcg set, with unaccounting occuring when > the refcount goes to 0. Right. > Networking currently will account/unaccount bytes just based on a > page count (and the memcg set in struct sock) rather than setting it in the page > itself - because the recycling of pages means the next time around it could be > charged to another memcg. And the refcount never goes to 1 due to the pooling > (in the absence of eviction for busy pages during packet reception). When > sitting in the driver page pool, non-null memcg does not work since we do not > know which socket (thus which memcg) the page would be destined for since we do > not know whose packet arrives next. > > The page pooling does make this all this a bit awkward, and the approach > in this patch seems to me the cleanest solution. I don't think it's clean, but as per above it also isn't complete. What's necessary is to give the network page pool a hookup to the page's put path so it knows when pages go unused. This would actually be great not just for this patch, but also for accounting the amount of memory consumed by the network stack in general, at the system level. This can be a sizable chunk these days, especially with some of the higher end nics. Users currently have no idea where their memory is going. And it seems it couldn't even answer this question right now without scanning the entire pool. It'd be great if it could track used and cached pages and report to vmstat. It would also be good if it could integrate with the page reclaim path and return any unused pages when the system is struggling with memory pressure. I don't see how it could easily/predictably do that without a good handle on what is and isn't used. These are all upsides even before your patch. Let's stop working around this quirk in the page pool, we've clearly run into the limit of this implementation. Here is an idea of how it could work: struct page already has struct { /* page_pool used by netstack */ /** * @dma_addr: might require a 64-bit value even on * 32-bit architectures. */ dma_addr_t dma_addr; }; and as you can see from its union neighbors, there is quite a bit more room to store private data necessary for the page pool. When a page's refcount hits zero and it's a networking page, we can feed it back to the page pool instead of the page allocator. 
From a first look, we should be able to use the PG_owner_priv_1 page flag for network pages (see how this flag is overloaded, we can add a PG_network alias). With this, we can identify the page in __put_page() and __release_page(). These functions are already aware of different types of pages and do their respective cleanup handling. We can similarly make network a first-class citizen and hand pages back to the network allocator from in there. Now the network KNOWS which of its pages are in use and which aren't. You can use that to uncharge pages without the DoS vector, and it'd go a great length toward introspecting and managing this memory - and not sit in a black hole as far as users and the MM are concerned. Thanks, Johannes
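A very rough sketch of the shape being proposed here; PG_network, PageNetwork() and netpool_return_page() are hypothetical names, and this is an illustration of the idea rather than a worked design:

    /* Alias one of the overloadable page flags for network pool pages,
     * following the existing PG_checked/PG_owner_priv_1 pattern. */
    #define PG_network PG_owner_priv_1
    PAGEFLAG(Network, network, PF_ANY)

    /* In __put_page(), once the refcount has dropped to zero: */
    if (PageNetwork(page)) {
            /* The page is provably unused now: uncharge it and hand it
             * back to the NIC's pool instead of the page allocator. */
            netpool_return_page(page);      /* hypothetical pool hook */
            return;
    }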
On Wed, Mar 17, 2021 at 3:12 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > On Tue, Mar 16, 2021 at 11:05:11PM -0700, Arjun Roy wrote: > > On Tue, Mar 16, 2021 at 3:27 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > > > Hello, > > > > > > On Mon, Mar 15, 2021 at 09:16:45PM -0700, Arjun Roy wrote: > > > > From: Arjun Roy <arjunroy@google.com> > > > > > > > > TCP zerocopy receive is used by high performance network applications > > > > to further scale. For RX zerocopy, the memory containing the network > > > > data filled by the network driver is directly mapped into the address > > > > space of high performance applications. To keep the TLB cost low, > > > > these applications unmap the network memory in big batches. So, this > > > > memory can remain mapped for long time. This can cause a memory > > > > isolation issue as this memory becomes unaccounted after getting > > > > mapped into the application address space. This patch adds the memcg > > > > accounting for such memory. > > > > > > > > Accounting the network memory comes with its own unique challenges. > > > > The high performance NIC drivers use page pooling to reuse the pages > > > > to eliminate/reduce expensive setup steps like IOMMU. These drivers > > > > keep an extra reference on the pages and thus we can not depend on the > > > > page reference for the uncharging. The page in the pool may keep a > > > > memcg pinned for arbitrary long time or may get used by other memcg. > > > > > > The page pool knows when a page is unmapped again and becomes > > > available for recycling, right? Essentially the 'free' phase of that > > > private allocator. That's where the uncharge should be done. > > > > > > > In general, no it does not. The page pool, how it operates and whether it > > exists in the first place, is an optimization that a given NIC driver can choose > > to make - and so there's no generic plumbing that ties page unmap events to > > something that a page pool could subscribe to that I am aware of. All it can do > > is check, at a given point, whether it can reuse a page or not, typically by > > checking the current page refcount. > > > > A couple of examples for drivers with such a mechanism - mlx5: > > (https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L248) > > if (page_ref_count(cache->page_cache[cache->head].page) != 1) > > So IIUC it co-opts the page count used by the page allocator, offset > by the base ref of the pool. That means it doesn't control the put > path and won't be aware of when pages are used and unused. > > How does it free those pages back to the system eventually? Does it > just do a series of put_page() on pages in the pool when something > determines it is too big? > > However it does it, it found some way to live without a > destructor. But now we need one. > > > Or intel fm10k: > > (https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/intel/fm10k/fm10k_main.c#L207) > > > > Note that typically map count is not checked (maybe because page-flipping > > receive zerocopy did not exist as a consideration when the driver was written). > > > > So given that the page pool is essentially checking on demand for whether a page > > is usable or not - since there is no specific plumbing invoked when a page is > > usable again (i.e. unmapped, in this case) - we opted to hook into when the > > mapcount is decremented inside unmap() path. > > The problem is that the page isn't reusable just because it's > unmapped. 
The user may have vmspliced those pages into a pipe and be > pinning them long after the unmap. > > I don't know whether that happens in real life, but it's a legitimate > thing to do. At the very least it would be an avenue for abuse. > > > > For one, it's more aligned with the usual memcg charge lifetime rules. > > > > > > But also it doesn't add what is essentially a private driver callback > > > to the generic file unmapping path. > > > > > > > I understand the concern, and share it - the specific thing we'd like to avoid > > is to have driver specific code in the unmap path, and not in the least because > > any given driver could do its own thing. > > > > Rather, we consider this mechanism that we added as generic to zerocopy network > > data reception - that it does the right thing, no matter what the driver is > > doing. This would be transparent to the driver, in other words - all the driver > > has to do is to continue doing what it was before, using page->refcnt == 1 to > > decide whether it can use a page or if it is not already in use. > > > > > > Consider this instead as a broadly applicable networking feature adding a > > callback to the unmap path, instead of a particular driver. And while it is just > > TCP at present, it fundamentally isn't limited to TCP. > > > > I do have a request for clarification, if you could specify the usual memcg > > charge lifetime rules that you are referring to here? Just to make sure we're on > > the same page. > > Usually, the charge lifetime is tied to the allocation lifetime. It > ends when the object is fed back to whatever pool it came out of, its > data is considered invalid, and its memory is available to new allocs. > > This isn't the case here, and as per above you may uncharge the page > long before the user actually relinquishes it. > > > > 1. The NR_FILE_MAPPED accounting. It is longstanding Linux behavior > > > that driver pages mapped into userspace are accounted as file > > > pages, because userspace is actually doing mmap() against a driver > > > file/fd (as opposed to an anon mmap). That is how they show up in > > > vmstat, in meminfo, and in the per process stats. There is no > > > reason to make memcg deviate from this. If we don't like it, it > > > should be taken on by changing vm_insert_page() - not trick rmap > > > into thinking these arent memcg pages and then fixing it up with > > > additional special-cased accounting callbacks. > > > > > > v1 did this right, it charged the pages the way we handle all other > > > userspace pages: before rmap, and then let the generic VM code do > > > the accounting for us with the cgroup-aware vmstat infrastructure. > > > > To clarify, are you referring to the v1 approach for this patch from a > > few weeks ago? > > Yes. > > > (i.e. charging for the page before vm_insert_page()). This patch changes when > > the charging happens, and, as you note, makes it a forced charge since we've > > already inserted the mappings into the user's address space - but it isn't > > otherwise fundamentally different from v1 in what it does. And unmap is the > > same. > > Right. The places where it IS different are the problem ;) > > Working around native VM accounting; adding custom accounting that is > inconsistent with the rest of the system; force-charging a quantity of > memory that the container may not be entitled to. > > Please revert back to precharging like in v1. > v3 will do so. 
> > The period of double counting in v1 of this patch was from around the time we do > > vm_insert_page() (note that the pages were accounted just prior to being > > inserted) till the struct sk_buff's were disposed of - for an skb > > that's up to 45 pages. > > That's seems fine. > > Can there be instances where the buffers are pinned by something else > for indefinite lengths of time? > > > But note that is for one socket, and there can be quite a lot of open > > sockets and > > depending on what happens in terms of scheduling the period of time we're > > double counting can be a bit high. > > You mean thread/CPU scheduling? > > If it comes down to that, I'm not convinced this is a practical > concern at this point. > > I think we can assume that the number of sockets and the number of > concurrent threads going through the receive path at any given time > will scale with the size of the cgroup. And even a thousand threads > reading from a thousand sockets at exactly the same time would double > charge a maximum of 175M for a brief period of time. Since few people > have 1,000 CPUs the possible overlap is further reduced. > > This isn't going to be a problem in the real world. > > > > The way I see it, any conflict here is caused by the pages being > > > counted in the SOCK counter already, but not actually *tracked* on > > > a per page basis. If it's worth addressing, we should look into > > > fixing the root cause over there first if possible, before trying > > > to work around it here. > > > > > > > When you say tracked on a per-page basis, I assume you mean using the usual > > mechanism where a page has a non-null memcg set, with unaccounting occuring when > > the refcount goes to 0. > > Right. > > > Networking currently will account/unaccount bytes just based on a > > page count (and the memcg set in struct sock) rather than setting it in the page > > itself - because the recycling of pages means the next time around it could be > > charged to another memcg. And the refcount never goes to 1 due to the pooling > > (in the absence of eviction for busy pages during packet reception). When > > sitting in the driver page pool, non-null memcg does not work since we do not > > know which socket (thus which memcg) the page would be destined for since we do > > not know whose packet arrives next. > > > > The page pooling does make this all this a bit awkward, and the approach > > in this patch seems to me the cleanest solution. > > I don't think it's clean, but as per above it also isn't complete. > > What's necessary is to give the network page pool a hookup to the > page's put path so it knows when pages go unused. > > This would actually be great not just for this patch, but also for > accounting the amount of memory consumed by the network stack in > general, at the system level. This can be a sizable chunk these days, > especially with some of the higher end nics. Users currently have no > idea where their memory is going. And it seems it couldn't even answer > this question right now without scanning the entire pool. It'd be > great if it could track used and cached pages and report to vmstat. > > It would also be good if it could integrate with the page reclaim path > and return any unused pages when the system is struggling with memory > pressure. I don't see how it could easily/predictably do that without > a good handle on what is and isn't used. > > These are all upsides even before your patch. 
Let's stop working > around this quirk in the page pool, we've clearly run into the limit > of this implementation. > > > Here is an idea of how it could work: > > struct page already has > > struct { /* page_pool used by netstack */ > /** > * @dma_addr: might require a 64-bit value even on > * 32-bit architectures. > */ > dma_addr_t dma_addr; > }; > > and as you can see from its union neighbors, there is quite a bit more > room to store private data necessary for the page pool. > > When a page's refcount hits zero and it's a networking page, we can > feed it back to the page pool instead of the page allocator. > > From a first look, we should be able to use the PG_owner_priv_1 page > flag for network pages (see how this flag is overloaded, we can add a > PG_network alias). With this, we can identify the page in __put_page() > and __release_page(). These functions are already aware of different > types of pages and do their respective cleanup handling. We can > similarly make network a first-class citizen and hand pages back to > the network allocator from in there. > > Now the network KNOWS which of its pages are in use and which > aren't. You can use that to uncharge pages without the DoS vector, and > it'd go a great length toward introspecting and managing this memory - > and not sit in a black hole as far as users and the MM are concerned. > To make sure we're on the same page, then, here's a tentative mechanism - I'd rather get buy in before spending too much time on something that wouldn't pass muster afterwards. A) An opt-in mechanism, that a driver needs to explicitly support, in order to get properly accounted receive zerocopy. B) Failure to opt-in (e.g. unchanged old driver) can either lead to unaccounted zerocopy (ie. existing behaviour) or, optionally, effectively disabled zerocopy (ie. any call to zerocopy will return something like EINVAL) (perhaps controlled via some sysctl, which either lets zerocopy through or not with/without accounting). The proposed mechanism would involve: 1) Some way of marking a page as being allocated by a driver that has decided to opt into this mechanism. Say, a page flag, or a memcg flag. 2) A callback provided by the driver, that takes a struct page*, and returns a boolean. The value of the boolean being true indicates that any and all refs on the page are held by the driver. False means there exists at least one reference that is not held by the driver. 3) A branch in put_page() that, for pages marked thus, will consult the driver callback and if it returns true, will uncharge the memcg for the page. The anonymous struct you defined above is part of a union that I think normally is one qword in length (well, could be more depending on the typedefs I saw there) and I think that can be co-opted to provide the driver callback - though, it might require growing the struct by one more qword since there may be drivers like mlx5 that are already using the field already in there for dma_addr. Anyways, the callback could then be used by the driver to handle the other accounting quirks you mentioned, without needing to scan the full pool. Of course there are corner cases and such to properly account for, but I just wanted to provide a really rough sketch to see if this (assuming it were properly implemented) was what you had in mind. If so I can put together a v3 patch. Per my response to Andrew earlier, this would make it even more confusing whether this is to be applied against net-next or mm trees. But that's a bridge to cross when we get to it. 
What are your thoughts? Thanks, -Arjun > Thanks, > Johannes
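A very rough sketch of the mechanism Arjun proposes above, for orientation only. The netpool_page_ops structure, the PageNetworkPool()/SetPageNetworkPool() flag helpers and the mydrv_* names are invented for illustration and are not part of the patch; the put_page() branch from step 3 is shown as a standalone helper that reuses the uncharge hook this series introduces, and it glosses over ordering and race details:

    struct netpool_page_ops {
            /* true iff every remaining reference is held by the driver */
            bool (*only_driver_refs)(struct page *page);
    };

    /* Driver opt-in: mark the page and stash its callback table. */
    static void mydrv_mark_pool_page(struct page *page,
                                     const struct netpool_page_ops *ops)
    {
            SetPageNetworkPool(page);                   /* hypothetical flag */
            set_page_private(page, (unsigned long)ops);
    }

    /* Step 3: uncharge once the driver holds the only references. */
    static inline void put_page_netpool(struct page *page)
    {
            const struct netpool_page_ops *ops = (void *)page_private(page);

            if (ops->only_driver_refs(page))
                    mem_cgroup_uncharge_sock_page(page);
            if (put_page_testzero(page))
                    __put_page(page);
    }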
On Wed 17-03-21 18:12:55, Johannes Weiner wrote: [...] > Here is an idea of how it could work: > > struct page already has > > struct { /* page_pool used by netstack */ > /** > * @dma_addr: might require a 64-bit value even on > * 32-bit architectures. > */ > dma_addr_t dma_addr; > }; > > and as you can see from its union neighbors, there is quite a bit more > room to store private data necessary for the page pool. > > When a page's refcount hits zero and it's a networking page, we can > feed it back to the page pool instead of the page allocator. > > From a first look, we should be able to use the PG_owner_priv_1 page > flag for network pages (see how this flag is overloaded, we can add a > PG_network alias). With this, we can identify the page in __put_page() > and __release_page(). These functions are already aware of different > types of pages and do their respective cleanup handling. We can > similarly make network a first-class citizen and hand pages back to > the network allocator from in there. For compound pages we have a concept of destructors. Maybe we can extend that for order-0 pages as well. The struct page is heavily packed and compound_dtor shares its storage with other metadata:

    int pages;                      /* 16 4 */
    unsigned char compound_dtor;    /* 16 1 */
    atomic_t hpage_pinned_refcount; /* 16 4 */
    pgtable_t pmd_huge_pte;         /* 16 8 */
    void * zone_device_data;        /* 16 8 */

But none of those should really need to be valid when a page is freed, unless I am missing something. It would require checking their users to make sure they do not leave that state behind. But if we can establish a contract that compound_dtor is always valid when a page is freed, this would be a really nice and useful abstraction, because you wouldn't have to care about the specific type of page. But maybe I am just overlooking the real complexity there.
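For reference, the compound-page destructor machinery Michal is pointing at already looks roughly like this (lightly trimmed from include/linux/mm.h of that era); the question in this subthread is whether something equivalent can be made to work for order-0 pages:

    typedef void compound_page_dtor(struct page *page);

    extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS];

    static inline void set_compound_page_dtor(struct page *page,
                                              enum compound_dtor_id compound_dtor)
    {
            VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page);
            page[1].compound_dtor = compound_dtor;
    }

    static inline void destroy_compound_page(struct page *page)
    {
            VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
            compound_page_dtors[page[1].compound_dtor](page);
    }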
On Mon, Mar 22, 2021 at 02:35:11PM -0700, Arjun Roy wrote: > To make sure we're on the same page, then, here's a tentative > mechanism - I'd rather get buy in before spending too much time on > something that wouldn't pass muster afterwards. > > A) An opt-in mechanism, that a driver needs to explicitly support, in > order to get properly accounted receive zerocopy. Yep, opt-in makes sense. That allows piece-by-piece conversion and avoids us having to have a flag day. > B) Failure to opt-in (e.g. unchanged old driver) can either lead to > unaccounted zerocopy (ie. existing behaviour) or, optionally, > effectively disabled zerocopy (ie. any call to zerocopy will return > something like EINVAL) (perhaps controlled via some sysctl, which > either lets zerocopy through or not with/without accounting). I'd suggest letting it fail gracefully (i.e. no -EINVAL) to not disturb existing/working setups during the transition period. But the exact policy is easy to change later on if we change our minds on it. > The proposed mechanism would involve: > 1) Some way of marking a page as being allocated by a driver that has > decided to opt into this mechanism. Say, a page flag, or a memcg flag. Right. I would stress it should not be a memcg flag or any direct channel from the network to memcg, as this would limit its usefulness while having the same maintenance overhead. It should make the network page a first class MM citizen - like an LRU page or a slab page - which can be accounted and introspected as such, including from the memcg side. So definitely a page flag. > 2) A callback provided by the driver, that takes a struct page*, and > returns a boolean. The value of the boolean being true indicates that > any and all refs on the page are held by the driver. False means there > exists at least one reference that is not held by the driver. I was thinking the PageNetwork flag would cover this, but maybe I'm missing something? > 3) A branch in put_page() that, for pages marked thus, will consult > the driver callback and if it returns true, will uncharge the memcg > for the page. The way I picture it, put_page() (and release_pages) should do this: void __put_page(struct page *page) { if (is_zone_device_page(page)) { put_dev_pagemap(page->pgmap); /* * The page belongs to the device that created pgmap. Do * not return it to page allocator. */ return; } + + if (PageNetwork(page)) { + put_page_network(page); + /* Page belongs to the network stack, not the page allocator */ + return; + } if (unlikely(PageCompound(page))) __put_compound_page(page); else __put_single_page(page); } where put_page_network() is the network-side callback that uncharges the page. (..and later can be extended to do all kinds of things when informed that the page has been freed: update statistics (mod_page_state), put it on a private network freelist, or ClearPageNetwork() and give it back to the page allocator etc. But for starters it can set_page_count(page, 1) after the uncharge to retain the current silent recycling behavior.) > The anonymous struct you defined above is part of a union that I think > normally is one qword in length (well, could be more depending on the > typedefs I saw there) and I think that can be co-opted to provide the > driver callback - though, it might require growing the struct by one > more qword since there may be drivers like mlx5 that are already using > the field already in there for dma_addr. 
The page cache / anonymous struct it's shared with is 5 words (double linked list pointers, mapping, index, private), and the network struct is currently one word, so you can add 4 words to a PageNetwork() page without increasing the size of struct page. That should be plenty of space to store auxiliary data for drivers, right? > Anyways, the callback could then be used by the driver to handle the > other accounting quirks you mentioned, without needing to scan the > full pool. Right. > Of course there are corner cases and such to properly account for, but > I just wanted to provide a really rough sketch to see if this > (assuming it were properly implemented) was what you had in mind. If > so I can put together a v3 patch. Yeah, makes perfect sense. We can keep iterating like this any time you feel you accumulate too many open questions. Not just for MM but also for the networking folks - although I suspect that the first step would be mostly about the MM infrastructure, and I'm not sure how much they care about the internals there ;) > Per my response to Andrew earlier, this would make it even more > confusing whether this is to be applied against net-next or mm trees. > But that's a bridge to cross when we get to it. The mm tree includes -next, so it should be a safe development target for the time being. I would then decide it based on how many changes your patch interacts with on either side. Changes to struct page and the put path are not very frequent, so I suspect it'll be easy to rebase to net-next and route everything through there. And if there are heavy changes on both sides, the -mm tree is the better route anyway. Does that sound reasonable?
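A minimal sketch of the put_page_network() starter behaviour described above - uncharge, then quietly hand the sole reference back to the pool - using the uncharge helper added by this series as a stand-in for whatever the final hook ends up being:

    void put_page_network(struct page *page)
    {
            /* Tell memcg the mapped/zerocopy user is done with the page. */
            mem_cgroup_uncharge_sock_page(page);

            /*
             * The refcount just hit zero; restore it to 1 so the driver's
             * pool once again owns the only reference, preserving today's
             * silent recycling behaviour. Statistics, freelists and giving
             * pages back to the allocator could layer on top of this later.
             */
            set_page_count(page, 1);
    }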
On Tue, Mar 23, 2021 at 10:01 AM Johannes Weiner <hannes@cmpxchg.org> wrote: > > On Mon, Mar 22, 2021 at 02:35:11PM -0700, Arjun Roy wrote: > > To make sure we're on the same page, then, here's a tentative > > mechanism - I'd rather get buy in before spending too much time on > > something that wouldn't pass muster afterwards. > > > > A) An opt-in mechanism, that a driver needs to explicitly support, in > > order to get properly accounted receive zerocopy. > > Yep, opt-in makes sense. That allows piece-by-piece conversion and > avoids us having to have a flag day. > > > B) Failure to opt-in (e.g. unchanged old driver) can either lead to > > unaccounted zerocopy (ie. existing behaviour) or, optionally, > > effectively disabled zerocopy (ie. any call to zerocopy will return > > something like EINVAL) (perhaps controlled via some sysctl, which > > either lets zerocopy through or not with/without accounting). > > I'd suggest letting it fail gracefully (i.e. no -EINVAL) to not > disturb existing/working setups during the transition period. But the > exact policy is easy to change later on if we change our minds on it. > > > The proposed mechanism would involve: > > 1) Some way of marking a page as being allocated by a driver that has > > decided to opt into this mechanism. Say, a page flag, or a memcg flag. > > Right. I would stress it should not be a memcg flag or any direct > channel from the network to memcg, as this would limit its usefulness > while having the same maintenance overhead. > > It should make the network page a first class MM citizen - like an LRU > page or a slab page - which can be accounted and introspected as such, > including from the memcg side. > > So definitely a page flag. Works for me. > > > 2) A callback provided by the driver, that takes a struct page*, and > > returns a boolean. The value of the boolean being true indicates that > > any and all refs on the page are held by the driver. False means there > > exists at least one reference that is not held by the driver. > > I was thinking the PageNetwork flag would cover this, but maybe I'm > missing something? > The main reason for a driver callback is to handle whatever driver-specific behaviour needs to be handled (ie. while a driver may use code from net/core/page_pool.c, it also may roll its own arbitrary behaviour and data structures). And because it's not necessarily the case that a driver would take exactly 1 ref of its own on the page. > > 3) A branch in put_page() that, for pages marked thus, will consult > > the driver callback and if it returns true, will uncharge the memcg > > for the page. > > The way I picture it, put_page() (and release_pages) should do this: > > void __put_page(struct page *page) > { > if (is_zone_device_page(page)) { > put_dev_pagemap(page->pgmap); > > /* > * The page belongs to the device that created pgmap. Do > * not return it to page allocator. > */ > return; > } > + > + if (PageNetwork(page)) { > + put_page_network(page); > + /* Page belongs to the network stack, not the page allocator */ > + return; > + } > > if (unlikely(PageCompound(page))) > __put_compound_page(page); > else > __put_single_page(page); > } > > where put_page_network() is the network-side callback that uncharges > the page. > > (..and later can be extended to do all kinds of things when informed > that the page has been freed: update statistics (mod_page_state), put > it on a private network freelist, or ClearPageNetwork() and give it > back to the page allocator etc. 
> Yes, this is more or less what I had in mind, though put_page_network() would also need to avail itself of the callback mentioned previously. > But for starters it can set_page_count(page, 1) after the uncharge to > retain the current silent recycling behavior.) > This would be one example of where the driver could conceivably have >1 ref for whatever reason (https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/mellanox/mlx4/en_rx.c#L495) where it looks like it could take 2 refs on a page, perhaps storing 2 x 1500B packets on a single 4KB page. > > The anonymous struct you defined above is part of a union that I think > > normally is one qword in length (well, could be more depending on the > > typedefs I saw there) and I think that can be co-opted to provide the > > driver callback - though, it might require growing the struct by one > > more qword since there may be drivers like mlx5 that are already using > > the field already in there for dma_addr. > > The page cache / anonymous struct it's shared with is 5 words (double > linked list pointers, mapping, index, private), and the network struct > is currently one word, so you can add 4 words to a PageNetwork() page > without increasing the size of struct page. That should be plenty of > space to store auxiliary data for drivers, right? > Ah, I think I was looking more narrowly at an older version of the struct. The new one is much easier to parse. :) 4 words should be plenty, I think. > > Anyways, the callback could then be used by the driver to handle the > > other accounting quirks you mentioned, without needing to scan the > > full pool. > > Right. > > > Of course there are corner cases and such to properly account for, but > > I just wanted to provide a really rough sketch to see if this > > (assuming it were properly implemented) was what you had in mind. If > > so I can put together a v3 patch. > > Yeah, makes perfect sense. We can keep iterating like this any time > you feel you accumulate too many open questions. Not just for MM but > also for the networking folks - although I suspect that the first step > would be mostly about the MM infrastructure, and I'm not sure how much > they care about the internals there ;) > > > Per my response to Andrew earlier, this would make it even more > > confusing whether this is to be applied against net-next or mm trees. > > But that's a bridge to cross when we get to it. > > The mm tree includes -next, so it should be a safe development target > for the time being. > > I would then decide it based on how many changes your patch interacts > with on either side. Changes to struct page and the put path are not > very frequent, so I suspect it'll be easy to rebase to net-next and > route everything through there. And if there are heavy changes on both > sides, the -mm tree is the better route anyway. > > Does that sound reasonable? This sounds good to me. To summarize then, it seems to me that we're on the same page now. I'll put together a tentative v3 such that: 1. It uses pre-charging, as previously discussed. 2. It uses a page flag to delineate pages of a certain networking sort (ie. this mechanism). 3. It avails itself of up to 4 words of data inside struct page, inside the networking specific struct. 4. And it sets up this opt-in lifecycle notification for drivers that choose to use it, falling back to existing behaviour without. Thanks, -Arjun
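Point 3 above - the extra words inside struct page's networking member - could sit in the existing union like this, purely as an assumed layout for discussion; every field other than dma_addr is invented here and would need to be reconciled with whatever the page pool itself wants to store:

    struct {        /* page_pool used by netstack */
            /**
             * @dma_addr: might require a 64-bit value even on
             * 32-bit architectures.
             */
            dma_addr_t dma_addr;
            unsigned long pp_flags;         /* invented: pool/page state */
            struct page_pool *pp;           /* invented: owning pool     */
            unsigned long pp_private[2];    /* invented: driver scratch  */
    };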
On Tue, Mar 23, 2021 at 7:34 AM Michal Hocko <mhocko@suse.com> wrote: > > On Wed 17-03-21 18:12:55, Johannes Weiner wrote: > [...] > > Here is an idea of how it could work: > > > > struct page already has > > > > struct { /* page_pool used by netstack */ > > /** > > * @dma_addr: might require a 64-bit value even on > > * 32-bit architectures. > > */ > > dma_addr_t dma_addr; > > }; > > > > and as you can see from its union neighbors, there is quite a bit more > > room to store private data necessary for the page pool. > > > > When a page's refcount hits zero and it's a networking page, we can > > feed it back to the page pool instead of the page allocator. > > > > From a first look, we should be able to use the PG_owner_priv_1 page > > flag for network pages (see how this flag is overloaded, we can add a > > PG_network alias). With this, we can identify the page in __put_page() > > and __release_page(). These functions are already aware of different > > types of pages and do their respective cleanup handling. We can > > similarly make network a first-class citizen and hand pages back to > > the network allocator from in there. > > For compound pages we have a concept of destructors. Maybe we can extend > that for order-0 pages as well. The struct page is heavily packed and > compound_dtor shares the storage without other metadata > int pages; /* 16 4 */ > unsigned char compound_dtor; /* 16 1 */ > atomic_t hpage_pinned_refcount; /* 16 4 */ > pgtable_t pmd_huge_pte; /* 16 8 */ > void * zone_device_data; /* 16 8 */ > > But none of those should really require to be valid when a page is freed > unless I am missing something. It would really require to check their > users whether they can leave the state behind. But if we can establish a > contract that compound_dtor can be always valid when a page is freed > this would be really a nice and useful abstraction because you wouldn't > have to care about the specific type of page. > > But maybe I am just overlooking the real complexity there. > -- For now probably the easiest way is to have network pages be first class with a specific flag as previously discussed and have concrete handling for it, rather than trying to establish the contract across page types. Thanks, -Arjun > Michal Hocko > SUSE Labs
On Tue 23-03-21 11:47:54, Arjun Roy wrote: > On Tue, Mar 23, 2021 at 7:34 AM Michal Hocko <mhocko@suse.com> wrote: > > > > On Wed 17-03-21 18:12:55, Johannes Weiner wrote: > > [...] > > > Here is an idea of how it could work: > > > > > > struct page already has > > > > > > struct { /* page_pool used by netstack */ > > > /** > > > * @dma_addr: might require a 64-bit value even on > > > * 32-bit architectures. > > > */ > > > dma_addr_t dma_addr; > > > }; > > > > > > and as you can see from its union neighbors, there is quite a bit more > > > room to store private data necessary for the page pool. > > > > > > When a page's refcount hits zero and it's a networking page, we can > > > feed it back to the page pool instead of the page allocator. > > > > > > From a first look, we should be able to use the PG_owner_priv_1 page > > > flag for network pages (see how this flag is overloaded, we can add a > > > PG_network alias). With this, we can identify the page in __put_page() > > > and __release_page(). These functions are already aware of different > > > types of pages and do their respective cleanup handling. We can > > > similarly make network a first-class citizen and hand pages back to > > > the network allocator from in there. > > > > For compound pages we have a concept of destructors. Maybe we can extend > > that for order-0 pages as well. The struct page is heavily packed and > > compound_dtor shares the storage without other metadata > > int pages; /* 16 4 */ > > unsigned char compound_dtor; /* 16 1 */ > > atomic_t hpage_pinned_refcount; /* 16 4 */ > > pgtable_t pmd_huge_pte; /* 16 8 */ > > void * zone_device_data; /* 16 8 */ > > > > But none of those should really require to be valid when a page is freed > > unless I am missing something. It would really require to check their > > users whether they can leave the state behind. But if we can establish a > > contract that compound_dtor can be always valid when a page is freed > > this would be really a nice and useful abstraction because you wouldn't > > have to care about the specific type of page. > > > > But maybe I am just overlooking the real complexity there. > > -- > > For now probably the easiest way is to have network pages be first > class with a specific flag as previously discussed and have concrete > handling for it, rather than trying to establish the contract across > page types. If you are going to claim a page flag then it would be much better to have it more generic. Flags are really scarce and if all you care about is PageHasDestructor() and provide one via page->dtor then the similar mechanism can be reused by somebody else. Or does anything prevent that?
On Tue, Mar 23, 2021 at 11:42 AM Arjun Roy <arjunroy@google.com> wrote: > [...] > > To summarize then, it seems to me that we're on the same page now. > I'll put together a tentative v3 such that: > 1. It uses pre-charging, as previously discussed. > 2. It uses a page flag to delineate pages of a certain networking sort > (ie. this mechanism). > 3. It avails itself of up to 4 words of data inside struct page, > inside the networking specific struct. > 4. And it sets up this opt-in lifecycle notification for drivers that > choose to use it, falling back to existing behaviour without. > Arjun, if you don't mind, can you explain how the lifetime of such a page will look like? For example: Driver: page = dev_alloc_page() /* page has 1 ref */ dev_map_page(page) /* I don't think dev_map_page() takes a ref on page, so the ref remains 1. */ On incoming traffic the page goes to skb and which then gets assigned to a struct sock. Does the kernel increase refcnt of the page on these operations? The page gets mapped into user space which increments its refcnt. After processing the data, the application unmaps the page and its refcnt will be decremented. __put_page() will be called when refcnt reaches 0, so, the initial refcnt which the driver has acquired, has to be transferred to the next layer. So, I am trying to understand how that will work?
On Wed, Mar 24, 2021 at 2:12 AM Michal Hocko <mhocko@suse.com> wrote: > > On Tue 23-03-21 11:47:54, Arjun Roy wrote: > > On Tue, Mar 23, 2021 at 7:34 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > On Wed 17-03-21 18:12:55, Johannes Weiner wrote: > > > [...] > > > > Here is an idea of how it could work: > > > > > > > > struct page already has > > > > > > > > struct { /* page_pool used by netstack */ > > > > /** > > > > * @dma_addr: might require a 64-bit value even on > > > > * 32-bit architectures. > > > > */ > > > > dma_addr_t dma_addr; > > > > }; > > > > > > > > and as you can see from its union neighbors, there is quite a bit more > > > > room to store private data necessary for the page pool. > > > > > > > > When a page's refcount hits zero and it's a networking page, we can > > > > feed it back to the page pool instead of the page allocator. > > > > > > > > From a first look, we should be able to use the PG_owner_priv_1 page > > > > flag for network pages (see how this flag is overloaded, we can add a > > > > PG_network alias). With this, we can identify the page in __put_page() > > > > and __release_page(). These functions are already aware of different > > > > types of pages and do their respective cleanup handling. We can > > > > similarly make network a first-class citizen and hand pages back to > > > > the network allocator from in there. > > > > > > For compound pages we have a concept of destructors. Maybe we can extend > > > that for order-0 pages as well. The struct page is heavily packed and > > > compound_dtor shares the storage without other metadata > > > int pages; /* 16 4 */ > > > unsigned char compound_dtor; /* 16 1 */ > > > atomic_t hpage_pinned_refcount; /* 16 4 */ > > > pgtable_t pmd_huge_pte; /* 16 8 */ > > > void * zone_device_data; /* 16 8 */ > > > > > > But none of those should really require to be valid when a page is freed > > > unless I am missing something. It would really require to check their > > > users whether they can leave the state behind. But if we can establish a > > > contract that compound_dtor can be always valid when a page is freed > > > this would be really a nice and useful abstraction because you wouldn't > > > have to care about the specific type of page. > > > > > > But maybe I am just overlooking the real complexity there. > > > -- > > > > For now probably the easiest way is to have network pages be first > > class with a specific flag as previously discussed and have concrete > > handling for it, rather than trying to establish the contract across > > page types. > > If you are going to claim a page flag then it would be much better to > have it more generic. Flags are really scarce and if all you care about > is PageHasDestructor() and provide one via page->dtor then the similar > mechanism can be reused by somebody else. Or does anything prevent that? The way I see it - the fundamental want here is, for some arbitrary page that we are dropping a reference on, to be able to tell that the provenance of the page is some network driver's page pool. If we added an enum target to compound_dtor, if we examine that offset in the page and look at that value, what guarantee do we have that the page isn't instead some other kind of page, and the byte value there was just coincidentally the one we were looking for (but it wasn't a network driver pool page)? 
Existing users of compound_dtor seem to check first that PageCompound() or PageHead() returns true - but in the specific scenario here, receiving network packets, those pages will tend not to be compound (and more specifically, compound pages are explicitly disallowed for TCP receive zerocopy).

Given that's the case, the options seem to be:
1) Use a page flag - with the downside that page flags are a severely limited resource.
2) Use some bits inside page->memcg_data - I believe Johannes had reasons against this, and MEMCG support isn't always enabled.
3) Use compound_dtor - but I think this would run into the problems above.

Thanks, -Arjun > -- > Michal Hocko > SUSE Labs
On Wed, Mar 24, 2021 at 1:39 PM Arjun Roy <arjunroy@google.com> wrote: > > On Wed, Mar 24, 2021 at 2:12 AM Michal Hocko <mhocko@suse.com> wrote: > > > > On Tue 23-03-21 11:47:54, Arjun Roy wrote: > > > On Tue, Mar 23, 2021 at 7:34 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > > > On Wed 17-03-21 18:12:55, Johannes Weiner wrote: > > > > [...] > > > > > Here is an idea of how it could work: > > > > > > > > > > struct page already has > > > > > > > > > > struct { /* page_pool used by netstack */ > > > > > /** > > > > > * @dma_addr: might require a 64-bit value even on > > > > > * 32-bit architectures. > > > > > */ > > > > > dma_addr_t dma_addr; > > > > > }; > > > > > > > > > > and as you can see from its union neighbors, there is quite a bit more > > > > > room to store private data necessary for the page pool. > > > > > > > > > > When a page's refcount hits zero and it's a networking page, we can > > > > > feed it back to the page pool instead of the page allocator. > > > > > > > > > > From a first look, we should be able to use the PG_owner_priv_1 page > > > > > flag for network pages (see how this flag is overloaded, we can add a > > > > > PG_network alias). With this, we can identify the page in __put_page() > > > > > and __release_page(). These functions are already aware of different > > > > > types of pages and do their respective cleanup handling. We can > > > > > similarly make network a first-class citizen and hand pages back to > > > > > the network allocator from in there. > > > > > > > > For compound pages we have a concept of destructors. Maybe we can extend > > > > that for order-0 pages as well. The struct page is heavily packed and > > > > compound_dtor shares the storage without other metadata > > > > int pages; /* 16 4 */ > > > > unsigned char compound_dtor; /* 16 1 */ > > > > atomic_t hpage_pinned_refcount; /* 16 4 */ > > > > pgtable_t pmd_huge_pte; /* 16 8 */ > > > > void * zone_device_data; /* 16 8 */ > > > > > > > > But none of those should really require to be valid when a page is freed > > > > unless I am missing something. It would really require to check their > > > > users whether they can leave the state behind. But if we can establish a > > > > contract that compound_dtor can be always valid when a page is freed > > > > this would be really a nice and useful abstraction because you wouldn't > > > > have to care about the specific type of page. > > > > > > > > But maybe I am just overlooking the real complexity there. > > > > -- > > > > > > For now probably the easiest way is to have network pages be first > > > class with a specific flag as previously discussed and have concrete > > > handling for it, rather than trying to establish the contract across > > > page types. > > > > If you are going to claim a page flag then it would be much better to > > have it more generic. Flags are really scarce and if all you care about > > is PageHasDestructor() and provide one via page->dtor then the similar > > mechanism can be reused by somebody else. Or does anything prevent that? > > The way I see it - the fundamental want here is, for some arbitrary > page that we are dropping a reference on, to be able to tell that the > provenance of the page is some network driver's page pool. 
If we added > an enum target to compound_dtor, if we examine that offset in the page > and look at that value, what guarantee do we have that the page isn't > instead some other kind of page, and the byte value there was just > coincidentally the one we were looking for (but it wasn't a network > driver pool page)? > > Existing users of compound_dtor seem to check first that a > PageCompound() or PageHead() return true - the specific scenario here, > of receiving network packets, those pages will tend to not be compound > (and more specifically, compound pages are explicitly disallowed for > TCP receive zerocopy). > > Given that's the case, the options seem to be: > 1) Use a page flag - with the downside that they are a severely > limited resource, > 2) Use some bits inside page->memcg_data - this I believe Johannes had > reasons against, and it isn't always the case that MEMCG support is > enabled. > 3) Use compound_dtor - but I think this would have problems for the > prior reasons. I don't think Michal is suggesting to use PageCompound() or PageHead(). He is suggesting to add a more general page flag (PageHasDestructor) and corresponding page->dtor, so other potential users can use it too.
On Wed, Mar 24, 2021 at 10:12:46AM +0100, Michal Hocko wrote: > On Tue 23-03-21 11:47:54, Arjun Roy wrote: > > On Tue, Mar 23, 2021 at 7:34 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > On Wed 17-03-21 18:12:55, Johannes Weiner wrote: > > > [...] > > > > Here is an idea of how it could work: > > > > > > > > struct page already has > > > > > > > > struct { /* page_pool used by netstack */ > > > > /** > > > > * @dma_addr: might require a 64-bit value even on > > > > * 32-bit architectures. > > > > */ > > > > dma_addr_t dma_addr; > > > > }; > > > > > > > > and as you can see from its union neighbors, there is quite a bit more > > > > room to store private data necessary for the page pool. > > > > > > > > When a page's refcount hits zero and it's a networking page, we can > > > > feed it back to the page pool instead of the page allocator. > > > > > > > > From a first look, we should be able to use the PG_owner_priv_1 page > > > > flag for network pages (see how this flag is overloaded, we can add a > > > > PG_network alias). With this, we can identify the page in __put_page() > > > > and __release_page(). These functions are already aware of different > > > > types of pages and do their respective cleanup handling. We can > > > > similarly make network a first-class citizen and hand pages back to > > > > the network allocator from in there. > > > > > > For compound pages we have a concept of destructors. Maybe we can extend > > > that for order-0 pages as well. The struct page is heavily packed and > > > compound_dtor shares the storage without other metadata > > > int pages; /* 16 4 */ > > > unsigned char compound_dtor; /* 16 1 */ > > > atomic_t hpage_pinned_refcount; /* 16 4 */ > > > pgtable_t pmd_huge_pte; /* 16 8 */ > > > void * zone_device_data; /* 16 8 */ > > > > > > But none of those should really require to be valid when a page is freed > > > unless I am missing something. It would really require to check their > > > users whether they can leave the state behind. But if we can establish a > > > contract that compound_dtor can be always valid when a page is freed > > > this would be really a nice and useful abstraction because you wouldn't > > > have to care about the specific type of page. Yeah technically nobody should leave these fields behind, but it sounds pretty awkward to manage an overloaded destructor with a refcounted object: Either every put would have to check ref==1 before to see if it will be the one to free the page, and then set up the destructor before putting the final ref. But that means we can't support lockless tryget() schemes like we have in the page cache with a destructor. Or you'd have to set up the destructor every time an overloaded field reverts to its null state, e.g. hpage_pinned_refcount goes back to 0. Neither of those sound practical to me. > > > But maybe I am just overlooking the real complexity there. > > > -- > > > > For now probably the easiest way is to have network pages be first > > class with a specific flag as previously discussed and have concrete > > handling for it, rather than trying to establish the contract across > > page types. > > If you are going to claim a page flag then it would be much better to > have it more generic. Flags are really scarce and if all you care about > is PageHasDestructor() and provide one via page->dtor then the similar > mechanism can be reused by somebody else. Or does anything prevent that? I was suggesting to alias PG_owner_priv_1, which currently isn't used on network pages. 
We don't need to allocate a brand-new page flag. I agree that a generic destructor for order-0 pages would be nice, but due to the decentralized nature of refcounting, the only way I think it would work in practice is by adding a new field to struct page that is not in conflict with any existing ones. By comparison, creating a network-type page consumes no additional space.
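The aliasing Johannes describes would follow the pattern include/linux/page-flags.h already uses for other PG_owner_priv_1 overloads. A sketch, written out long-hand rather than via the PAGEFLAG() macros a real header change would use; PG_network and these helpers do not exist upstream:

    #define PG_network PG_owner_priv_1      /* proposed alias, not upstream */

    static __always_inline int PageNetwork(struct page *page)
    {
            return test_bit(PG_network, &page->flags);
    }

    static __always_inline void SetPageNetwork(struct page *page)
    {
            set_bit(PG_network, &page->flags);
    }

    static __always_inline void ClearPageNetwork(struct page *page)
    {
            clear_bit(PG_network, &page->flags);
    }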
On Wed 24-03-21 13:53:34, Shakeel Butt wrote: [...] > > Given that's the case, the options seem to be: > > 1) Use a page flag - with the downside that they are a severely > > limited resource, > > 2) Use some bits inside page->memcg_data - this I believe Johannes had > > reasons against, and it isn't always the case that MEMCG support is > > enabled. > > 3) Use compound_dtor - but I think this would have problems for the > > prior reasons. > > I don't think Michal is suggesting to use PageCompound() or > PageHead(). He is suggesting to add a more general page flag > (PageHasDestructor) and corresponding page->dtor, so other potential > users can use it too. Yes, that is exactly my point. If there is a page flag to signal a specific destructor then we can reuse an already existing scheme. I haven't fully digested Johannes' other reply yet, so I might still be missing something, but fundamentally, if somebody knows that the particular part of the page is not used (most really should), then the page can claim a destructor via a flag and the freeing routine would just call that callback. Or is there any reason why other subsystems outside of networking couldn't claim their own callback?
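In code form, the generic scheme Michal describes would look something like the following. PageHasDestructor(), page->dtor and the page_dtors[] table are all hypothetical; the surrounding __put_page() shape follows the sketch quoted earlier in the thread:

    typedef void page_dtor_fn(struct page *page);

    static page_dtor_fn *page_dtors[NR_PAGE_DTORS];     /* hypothetical table */

    void __put_page(struct page *page)
    {
            if (PageHasDestructor(page)) {
                    /* e.g. a network pool taking its page back */
                    page_dtors[page->dtor](page);
                    return;
            }

            if (unlikely(PageCompound(page)))
                    __put_compound_page(page);
            else
                    __put_single_page(page);
    }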
On Wed, Mar 24, 2021 at 11:26 AM Shakeel Butt <shakeelb@google.com> wrote: > > On Tue, Mar 23, 2021 at 11:42 AM Arjun Roy <arjunroy@google.com> wrote: > > > [...] > > > > To summarize then, it seems to me that we're on the same page now. > > I'll put together a tentative v3 such that: > > 1. It uses pre-charging, as previously discussed. > > 2. It uses a page flag to delineate pages of a certain networking sort > > (ie. this mechanism). > > 3. It avails itself of up to 4 words of data inside struct page, > > inside the networking specific struct. > > 4. And it sets up this opt-in lifecycle notification for drivers that > > choose to use it, falling back to existing behaviour without. > > > > Arjun, if you don't mind, can you explain how the lifetime of such a > page will look like? > > For example: > > Driver: > page = dev_alloc_page() > /* page has 1 ref */ Yes, this is the case. > dev_map_page(page) > /* I don't think dev_map_page() takes a ref on page, so the ref remains 1. */ > To be clear, do you mean things like DMA setup here? Or specifically what do you mean by dev_map_page? > On incoming traffic the page goes to skb and which then gets assigned > to a struct sock. Does the kernel increase refcnt of the page on these > operations? > Adding a page to an skb will mean that, when the skb is cleaned up, a page ref is dropped: https://github.com/torvalds/linux/blob/master/net/core/skbuff.c#L666 So a driver may bump the refcount for the page, before adding it to the skb: https://github.com/torvalds/linux/blob/master/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c#L442 > The page gets mapped into user space which increments its refcnt. > Yes. > After processing the data, the application unmaps the page and its > refcnt will be decremented. > Yes. > __put_page() will be called when refcnt reaches 0, so, the initial > refcnt which the driver has acquired, has to be transferred to the > next layer. So, I am trying to understand how that will work? Ah, I see - there was a miscommunication. Johannes mentioned __put_page() but I read put_page(). That is where I was planning on adding the interposition for these network pages. So in put_page(), if it turns out it's a network page, we do our handling then as I described in prior emails. Sorry for the confusion. Thanks, -Arjun
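The refcount flow Arjun describes - the pool keeps its own reference, and each skb fragment takes another - looks roughly like this on the RX path; the function name, offsets and truesize below are illustrative rather than taken from any particular driver:

    static void mydrv_rx_add_frag(struct sk_buff *skb, struct page *page,
                                  unsigned int offset, unsigned int len)
    {
            /* The pool's reference stays with the driver... */
            get_page(page);         /* ...and the skb frag takes its own. */
            skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, len,
                            PAGE_SIZE / 2 /* truesize: half-page buffers, say */);
            /*
             * When the skb is consumed, skb_release_data() drops the frag
             * reference and the page is back to refcount == 1 (pool only),
             * unless userspace still holds a zerocopy mapping of it.
             */
    }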
On Wed, Mar 24, 2021 at 2:24 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > On Wed, Mar 24, 2021 at 10:12:46AM +0100, Michal Hocko wrote: > > On Tue 23-03-21 11:47:54, Arjun Roy wrote: > > > On Tue, Mar 23, 2021 at 7:34 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > > > On Wed 17-03-21 18:12:55, Johannes Weiner wrote: > > > > [...] > > > > > Here is an idea of how it could work: > > > > > > > > > > struct page already has > > > > > > > > > > struct { /* page_pool used by netstack */ > > > > > /** > > > > > * @dma_addr: might require a 64-bit value even on > > > > > * 32-bit architectures. > > > > > */ > > > > > dma_addr_t dma_addr; > > > > > }; > > > > > > > > > > and as you can see from its union neighbors, there is quite a bit more > > > > > room to store private data necessary for the page pool. > > > > > > > > > > When a page's refcount hits zero and it's a networking page, we can > > > > > feed it back to the page pool instead of the page allocator. > > > > > > > > > > From a first look, we should be able to use the PG_owner_priv_1 page > > > > > flag for network pages (see how this flag is overloaded, we can add a > > > > > PG_network alias). With this, we can identify the page in __put_page() > > > > > and __release_page(). These functions are already aware of different > > > > > types of pages and do their respective cleanup handling. We can > > > > > similarly make network a first-class citizen and hand pages back to > > > > > the network allocator from in there. > > > > > > > > For compound pages we have a concept of destructors. Maybe we can extend > > > > that for order-0 pages as well. The struct page is heavily packed and > > > > compound_dtor shares the storage without other metadata > > > > int pages; /* 16 4 */ > > > > unsigned char compound_dtor; /* 16 1 */ > > > > atomic_t hpage_pinned_refcount; /* 16 4 */ > > > > pgtable_t pmd_huge_pte; /* 16 8 */ > > > > void * zone_device_data; /* 16 8 */ > > > > > > > > But none of those should really require to be valid when a page is freed > > > > unless I am missing something. It would really require to check their > > > > users whether they can leave the state behind. But if we can establish a > > > > contract that compound_dtor can be always valid when a page is freed > > > > this would be really a nice and useful abstraction because you wouldn't > > > > have to care about the specific type of page. > > Yeah technically nobody should leave these fields behind, but it > sounds pretty awkward to manage an overloaded destructor with a > refcounted object: > > Either every put would have to check ref==1 before to see if it will > be the one to free the page, and then set up the destructor before > putting the final ref. But that means we can't support lockless > tryget() schemes like we have in the page cache with a destructor. > Ah, I think I see what you were getting at with your prior email - at first I thought your suggestion was that, since the driver may have its own refcount, every put would need to check ref == 1 and call into the driver if need be. Instead, and correct me if I'm wrong, it seems like what you're advocating is: 1) The (opted in) driver no longer hangs onto the ref, 2) Now refcount can go all the way to 0, 3) And when it does, due to the special destructor this page has, it goes back to the driver, rather than the system? > Or you'd have to set up the destructor every time an overloaded field > reverts to its null state, e.g. hpage_pinned_refcount goes back to 0. 
> > Neither of those sound practical to me. > > > > > But maybe I am just overlooking the real complexity there. > > > > -- > > > > > > For now probably the easiest way is to have network pages be first > > > class with a specific flag as previously discussed and have concrete > > > handling for it, rather than trying to establish the contract across > > > page types. > > > > If you are going to claim a page flag then it would be much better to > > have it more generic. Flags are really scarce and if all you care about > > is PageHasDestructor() and provide one via page->dtor then the similar > > mechanism can be reused by somebody else. Or does anything prevent that? > > I was suggesting to alias PG_owner_priv_1, which currently isn't used > on network pages. We don't need to allocate a brandnew page flag. > Just to be certain, is there any danger of having a page, that would not be a network driver page originally, being inside __put_page(), such that PG_owner_priv_1 is set (but with one of its other overloaded meanings)? Thanks, -Arjun > I agree that a generic destructor for order-0 pages would be nice, but > due to the decentralized nature of refcounting the only way I think it > would work in practice is by adding a new field to struct page that is > not in conflict with any existing ones. > > Comparably, creating a network type page consumes no additional space.
On Wed 24-03-21 15:49:15, Arjun Roy wrote: > On Wed, Mar 24, 2021 at 2:24 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > On Wed, Mar 24, 2021 at 10:12:46AM +0100, Michal Hocko wrote: > > > On Tue 23-03-21 11:47:54, Arjun Roy wrote: > > > > On Tue, Mar 23, 2021 at 7:34 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > > > > > On Wed 17-03-21 18:12:55, Johannes Weiner wrote: > > > > > [...] > > > > > > Here is an idea of how it could work: > > > > > > > > > > > > struct page already has > > > > > > > > > > > > struct { /* page_pool used by netstack */ > > > > > > /** > > > > > > * @dma_addr: might require a 64-bit value even on > > > > > > * 32-bit architectures. > > > > > > */ > > > > > > dma_addr_t dma_addr; > > > > > > }; > > > > > > > > > > > > and as you can see from its union neighbors, there is quite a bit more > > > > > > room to store private data necessary for the page pool. > > > > > > > > > > > > When a page's refcount hits zero and it's a networking page, we can > > > > > > feed it back to the page pool instead of the page allocator. > > > > > > > > > > > > From a first look, we should be able to use the PG_owner_priv_1 page > > > > > > flag for network pages (see how this flag is overloaded, we can add a > > > > > > PG_network alias). With this, we can identify the page in __put_page() > > > > > > and __release_page(). These functions are already aware of different > > > > > > types of pages and do their respective cleanup handling. We can > > > > > > similarly make network a first-class citizen and hand pages back to > > > > > > the network allocator from in there. > > > > > > > > > > For compound pages we have a concept of destructors. Maybe we can extend > > > > > that for order-0 pages as well. The struct page is heavily packed and > > > > > compound_dtor shares the storage without other metadata > > > > > int pages; /* 16 4 */ > > > > > unsigned char compound_dtor; /* 16 1 */ > > > > > atomic_t hpage_pinned_refcount; /* 16 4 */ > > > > > pgtable_t pmd_huge_pte; /* 16 8 */ > > > > > void * zone_device_data; /* 16 8 */ > > > > > > > > > > But none of those should really require to be valid when a page is freed > > > > > unless I am missing something. It would really require to check their > > > > > users whether they can leave the state behind. But if we can establish a > > > > > contract that compound_dtor can be always valid when a page is freed > > > > > this would be really a nice and useful abstraction because you wouldn't > > > > > have to care about the specific type of page. > > > > Yeah technically nobody should leave these fields behind, but it > > sounds pretty awkward to manage an overloaded destructor with a > > refcounted object: > > > > Either every put would have to check ref==1 before to see if it will > > be the one to free the page, and then set up the destructor before > > putting the final ref. But that means we can't support lockless > > tryget() schemes like we have in the page cache with a destructor. I do not follow the ref==1 part. I mean to use the hugetlb model where the destructore is configured for the whole lifetime until the page is freed back to the allocator (see below). > Ah, I think I see what you were getting at with your prior email - at > first I thought your suggestion was that, since the driver may have > its own refcount, every put would need to check ref == 1 and call into > the driver if need be. 
> > Instead, and correct me if I'm wrong, it seems like what you're advocating is: > 1) The (opted in) driver no longer hangs onto the ref, > 2) Now refcount can go all the way to 0, > 3) And when it does, due to the special destructor this page has, it > goes back to the driver, rather than the system? Yes, this is the model we use for hugetlb pages for example. Have a look at free_huge_page path (HUGETLB_PAGE_DTOR destructor). It can either enqueue a freed page to its pool or to the page allocator depending on the pool state. [...] > > > If you are going to claim a page flag then it would be much better to > > > have it more generic. Flags are really scarce and if all you care about > > > is PageHasDestructor() and provide one via page->dtor then the similar > > > mechanism can be reused by somebody else. Or does anything prevent that? > > > > I was suggesting to alias PG_owner_priv_1, which currently isn't used > > on network pages. We don't need to allocate a brandnew page flag. > > > > Just to be certain, is there any danger of having a page, that would > not be a network driver page originally, being inside __put_page(), > such that PG_owner_priv_1 is set (but with one of its other overloaded > meanings)? Yeah this is a real question. And from what Johannes is saying this might pose some problem for this specific flags. I still need to check all the users to be certain. One thing is clear though. PG_owner_priv_1 is not a part of PAGE_FLAGS_CHECK_AT_FREE so it is not checked when a page is freed so it is possible that the flag is left behind when somebody does final put_page which makes things harder. But that would be the case even when the flag is used for network specific handling because you cannot tell it from other potential users who are outside of networking...
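The hugetlb model referenced above, in heavily simplified form (the real free_huge_page() also handles active-list bookkeeping, reservations and hugetlb_cgroup uncharging): the destructor runs once the refcount drops to zero and decides between the pool and the page allocator.

    void free_huge_page(struct page *page)
    {
            struct hstate *h = page_hstate(page);
            int nid = page_to_nid(page);

            spin_lock(&hugetlb_lock);
            if (h->surplus_huge_pages_node[nid]) {
                    /* surplus page: return it to the buddy allocator */
                    update_and_free_page(h, page);
                    h->surplus_huge_pages--;
                    h->surplus_huge_pages_node[nid]--;
            } else {
                    /* keep it cached in the hugetlb free pool */
                    enqueue_huge_page(h, page);
            }
            spin_unlock(&hugetlb_lock);
    }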
On Thu, Mar 25, 2021 at 10:02:28AM +0100, Michal Hocko wrote: > On Wed 24-03-21 15:49:15, Arjun Roy wrote: > > On Wed, Mar 24, 2021 at 2:24 PM Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > > > On Wed, Mar 24, 2021 at 10:12:46AM +0100, Michal Hocko wrote: > > > > On Tue 23-03-21 11:47:54, Arjun Roy wrote: > > > > > On Tue, Mar 23, 2021 at 7:34 AM Michal Hocko <mhocko@suse.com> wrote: > > > > > > > > > > > > On Wed 17-03-21 18:12:55, Johannes Weiner wrote: > > > > > > [...] > > > > > > > Here is an idea of how it could work: > > > > > > > > > > > > > > struct page already has > > > > > > > > > > > > > > struct { /* page_pool used by netstack */ > > > > > > > /** > > > > > > > * @dma_addr: might require a 64-bit value even on > > > > > > > * 32-bit architectures. > > > > > > > */ > > > > > > > dma_addr_t dma_addr; > > > > > > > }; > > > > > > > > > > > > > > and as you can see from its union neighbors, there is quite a bit more > > > > > > > room to store private data necessary for the page pool. > > > > > > > > > > > > > > When a page's refcount hits zero and it's a networking page, we can > > > > > > > feed it back to the page pool instead of the page allocator. > > > > > > > > > > > > > > From a first look, we should be able to use the PG_owner_priv_1 page > > > > > > > flag for network pages (see how this flag is overloaded, we can add a > > > > > > > PG_network alias). With this, we can identify the page in __put_page() > > > > > > > and __release_page(). These functions are already aware of different > > > > > > > types of pages and do their respective cleanup handling. We can > > > > > > > similarly make network a first-class citizen and hand pages back to > > > > > > > the network allocator from in there. > > > > > > > > > > > > For compound pages we have a concept of destructors. Maybe we can extend > > > > > > that for order-0 pages as well. The struct page is heavily packed and > > > > > > compound_dtor shares the storage without other metadata > > > > > > int pages; /* 16 4 */ > > > > > > unsigned char compound_dtor; /* 16 1 */ > > > > > > atomic_t hpage_pinned_refcount; /* 16 4 */ > > > > > > pgtable_t pmd_huge_pte; /* 16 8 */ > > > > > > void * zone_device_data; /* 16 8 */ > > > > > > > > > > > > But none of those should really require to be valid when a page is freed > > > > > > unless I am missing something. It would really require to check their > > > > > > users whether they can leave the state behind. But if we can establish a > > > > > > contract that compound_dtor can be always valid when a page is freed > > > > > > this would be really a nice and useful abstraction because you wouldn't > > > > > > have to care about the specific type of page. > > > > > > Yeah technically nobody should leave these fields behind, but it > > > sounds pretty awkward to manage an overloaded destructor with a > > > refcounted object: > > > > > > Either every put would have to check ref==1 before to see if it will > > > be the one to free the page, and then set up the destructor before > > > putting the final ref. But that means we can't support lockless > > > tryget() schemes like we have in the page cache with a destructor. > > I do not follow the ref==1 part. I mean to use the hugetlb model where > the destructore is configured for the whole lifetime until the page is > freed back to the allocator (see below). That only works if the destructor field doesn't overlap with a member the page type itself doesn't want to use. 
Page types that do want to use it would need to keep that field exclusive. We couldn't use it for LRU pages e.g. because it overlaps with the lru.next pointer. But if we bother with a 'generic' destructor for order-0 pages the LRU pages would actually be a prime candidate for a destructor: lru removal, memcg uncharging, page waiter cleanup... The field is also kind of wasteful. There are only a few different dtors but it occupies an entire byte. Page types don't necessarily have other bytes they could pair it with, so it might often take up a whole word. This is all fine with compound pages because we have all this space in the tail pages. It's not good in order-0 pages. So again, yes it would be nice to have generic destructors, but I just don't see how it's practical. Making network pages first-class objects instead makes a lot more sense to me for the time being. It's not like it's some small random driver usecase - it's the network stack! > > Ah, I think I see what you were getting at with your prior email - at > > first I thought your suggestion was that, since the driver may have > > its own refcount, every put would need to check ref == 1 and call into > > the driver if need be. > > > > Instead, and correct me if I'm wrong, it seems like what you're advocating is: > > 1) The (opted in) driver no longer hangs onto the ref, > > 2) Now refcount can go all the way to 0, > > 3) And when it does, due to the special destructor this page has, it > > goes back to the driver, rather than the system? Yes, correct. > > > > If you are going to claim a page flag then it would be much better to > > > > have it more generic. Flags are really scarce and if all you care about > > > > is PageHasDestructor() and provide one via page->dtor then the similar > > > > mechanism can be reused by somebody else. Or does anything prevent that? > > > > > > I was suggesting to alias PG_owner_priv_1, which currently isn't used > > > on network pages. We don't need to allocate a brandnew page flag. > > > > > > > Just to be certain, is there any danger of having a page, that would > > not be a network driver page originally, being inside __put_page(), > > such that PG_owner_priv_1 is set (but with one of its other overloaded > > meanings)? > > Yeah this is a real question. And from what Johannes is saying this > might pose some problem for this specific flags. I still need to check > all the users to be certain. One thing is clear though. PG_owner_priv_1 > is not a part of PAGE_FLAGS_CHECK_AT_FREE so it is not checked when a > page is freed so it is possible that the flag is left behind when > somebody does final put_page which makes things harder. The swapcache holds a reference, so PG_swapcache is never set on the final put. PG_checked refers to the data stored in file pages. It's possible not all filesystems clear it properly on truncate right now, but they don't need it once the data itself has become invalid. I double checked with Chris Mason. PG_pinned pages are Xen pagetables, they aren't actually refcounted. PG_xen_remapped pages are dma-remaps and aren't refcounted either. PG_foreign pages appear refcounted, but AFAICS the foreign map state is against a table that holds refs, so can't be set on put. I'll audit the PageChecked sites more closely and send a patch to tighten it up and add PG_owner_priv_1 to PAGE_FLAGS_CHECK_AT_FREE. That would be better anyway. 
> But that would be the case even when the flag is used for network > specific handling because you cannot tell it from other potential users > who are outside of networking... For non-refcounted pages, we'll catch it in the page allocator. For refcounted pages a bug is a bit trickier, but I don't think worse than the baseline risk of aliased page flags in the first place, and likely the network destructor would crash on a non-network page (assuming there is a driver-specific callback/ops struct in there).
On Thu 25-03-21 12:47:04, Johannes Weiner wrote:
> On Thu, Mar 25, 2021 at 10:02:28AM +0100, Michal Hocko wrote:
> > On Wed 24-03-21 15:49:15, Arjun Roy wrote:
> > > On Wed, Mar 24, 2021 at 2:24 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > >
> > > > On Wed, Mar 24, 2021 at 10:12:46AM +0100, Michal Hocko wrote:
> > > > > On Tue 23-03-21 11:47:54, Arjun Roy wrote:
> > > > > > On Tue, Mar 23, 2021 at 7:34 AM Michal Hocko <mhocko@suse.com> wrote:
> > > > > > >
> > > > > > > On Wed 17-03-21 18:12:55, Johannes Weiner wrote:
> > > > > > > [...]
> > > > > > > > Here is an idea of how it could work:
> > > > > > > >
> > > > > > > > struct page already has
> > > > > > > >
> > > > > > > >     struct {    /* page_pool used by netstack */
> > > > > > > >         /**
> > > > > > > >          * @dma_addr: might require a 64-bit value even on
> > > > > > > >          * 32-bit architectures.
> > > > > > > >          */
> > > > > > > >         dma_addr_t dma_addr;
> > > > > > > >     };
> > > > > > > >
> > > > > > > > and as you can see from its union neighbors, there is quite a bit more
> > > > > > > > room to store private data necessary for the page pool.
> > > > > > > >
> > > > > > > > When a page's refcount hits zero and it's a networking page, we can
> > > > > > > > feed it back to the page pool instead of the page allocator.
> > > > > > > >
> > > > > > > > From a first look, we should be able to use the PG_owner_priv_1 page
> > > > > > > > flag for network pages (see how this flag is overloaded, we can add a
> > > > > > > > PG_network alias). With this, we can identify the page in __put_page()
> > > > > > > > and __release_page(). These functions are already aware of different
> > > > > > > > types of pages and do their respective cleanup handling. We can
> > > > > > > > similarly make network a first-class citizen and hand pages back to
> > > > > > > > the network allocator from in there.
> > > > > > >
> > > > > > > For compound pages we have a concept of destructors. Maybe we can extend
> > > > > > > that for order-0 pages as well. The struct page is heavily packed and
> > > > > > > compound_dtor shares its storage with other metadata:
> > > > > > >
> > > > > > >     int pages;                      /* 16  4 */
> > > > > > >     unsigned char compound_dtor;    /* 16  1 */
> > > > > > >     atomic_t hpage_pinned_refcount; /* 16  4 */
> > > > > > >     pgtable_t pmd_huge_pte;         /* 16  8 */
> > > > > > >     void * zone_device_data;        /* 16  8 */
> > > > > > >
> > > > > > > But none of those should really need to be valid when a page is freed,
> > > > > > > unless I am missing something. It would really require checking their
> > > > > > > users whether they can leave the state behind. But if we can establish a
> > > > > > > contract that compound_dtor is always valid when a page is freed, this
> > > > > > > would be a really nice and useful abstraction because you wouldn't have
> > > > > > > to care about the specific type of page.
> > > >
> > > > Yeah, technically nobody should leave these fields behind, but it
> > > > sounds pretty awkward to manage an overloaded destructor with a
> > > > refcounted object:
> > > >
> > > > Either every put would have to check ref==1 beforehand to see if it
> > > > will be the one to free the page, and then set up the destructor
> > > > before putting the final ref. But that means we can't support
> > > > lockless tryget() schemes like we have in the page cache with a
> > > > destructor.
> >
> > I do not follow the ref==1 part. I mean to use the hugetlb model where
> > the destructor is configured for the whole lifetime until the page is
> > freed back to the allocator (see below).
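[Before the reply continues: for readers unfamiliar with "the hugetlb model" referenced just above, this is roughly how the existing compound-page destructor machinery works, abridged from the v5.12-era include/linux/mm.h and mm/page_alloc.c (config #ifdefs and unrelated members omitted).]

```c
/* Abridged from include/linux/mm.h of that era. */
enum compound_dtor_id {
	NULL_COMPOUND_DTOR,
	COMPOUND_PAGE_DTOR,
	HUGETLB_PAGE_DTOR,		/* set once when the hugetlb page is prepped */
	TRANSPARENT_HUGEPAGE_DTOR,
	NR_COMPOUND_DTORS,
};

static inline void set_compound_page_dtor(struct page *page,
		enum compound_dtor_id compound_dtor)
{
	VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page);
	page[1].compound_dtor = compound_dtor;	/* lives in the first tail page */
}

static inline void destroy_compound_page(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHead(page), page);
	compound_page_dtors[page[1].compound_dtor](page);
}
```

[hugetlb sets HUGETLB_PAGE_DTOR once at page-prep time and the entry stays valid until free time, when free_huge_page() runs as the destructor. Crucially, the index is stashed in a tail page, which is exactly the storage an order-0 page does not have; that is the problem the rest of the thread runs into.]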
> That only works if the destructor field doesn't overlap with a member
> the page type itself doesn't want to use. Page types that do want to
> use it would need to keep that field exclusive.

Right.

> We couldn't use it for LRU pages e.g. because it overlaps with the
> lru.next pointer.

Dang, I have completely missed this. I was looking at pahole because struct page is unreadable in the C code, but I tricked myself into only looking at offset 16. The initial set of candidates looked really promising. But overlapping with the list_head is a deal breaker. This makes using a dtor for most order-0 pages indeed unfeasible. Maybe the dtor could be relocated, but that is certainly a rabbit hole people (rightfully) avoid as much as possible. So you are right, and going with a networking-specific way is more reasonable.

[...]

> So again, yes it would be nice to have generic destructors, but I just
> don't see how it's practical.

Just to clarify on this: I didn't really mean to use this mechanism for all/most pages. I just wanted to have PageHasDestructor rather than PageNetwork, because both would express a special need for freeing - but that would require the dtor to live outside of the lru. Thanks!
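[To make the overlap conceded above concrete: a heavily abridged view of the v5.12-era struct page union from include/linux/mm_types.h, showing only the variants mentioned in this thread, with most members elided. Not a complete or exact definition.]

```c
struct page {
	unsigned long flags;
	union {
		struct {	/* Page cache and anonymous (LRU) pages */
			struct list_head lru;
			struct address_space *mapping;
			pgoff_t index;
			unsigned long private;
		};
		struct {	/* Tail pages of a compound page */
			unsigned long compound_head;
			unsigned char compound_dtor;
			unsigned char compound_order;
			atomic_t compound_mapcount;
			unsigned int compound_nr;
		};
		struct {	/* page_pool used by netstack */
			dma_addr_t dma_addr;
		};
		/* ... further variants elided ... */
	};
	/* ... _refcount, memcg_data, etc. elided ... */
};
```

[An order-0 LRU page uses the lru variant for its whole lifetime, so there is no slot in which a destructor could stay valid until free time; a compound page can park it in a tail page, which is why the compound/hugetlb scheme works but a generic order-0 one does not.]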
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e6dc793d587d..d67bc2aec7f6 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -349,11 +349,13 @@ extern struct mem_cgroup *root_mem_cgroup; enum page_memcg_data_flags { /* page->memcg_data is a pointer to an objcgs vector */ - MEMCG_DATA_OBJCGS = (1UL << 0), + MEMCG_DATA_OBJCGS = (1UL << 0), /* page has been accounted as a non-slab kernel page */ - MEMCG_DATA_KMEM = (1UL << 1), + MEMCG_DATA_KMEM = (1UL << 1), + /* page has been accounted as network memory */ + MEMCG_DATA_SOCK = (1UL << 2), /* the next bit after the last actual flag */ - __NR_MEMCG_DATA_FLAGS = (1UL << 2), + __NR_MEMCG_DATA_FLAGS = (1UL << 3), }; #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) @@ -444,6 +446,11 @@ static inline bool PageMemcgKmem(struct page *page) return page->memcg_data & MEMCG_DATA_KMEM; } +static inline bool PageMemcgSock(struct page *page) +{ + return page->memcg_data & MEMCG_DATA_SOCK; +} + #ifdef CONFIG_MEMCG_KMEM /* * page_objcgs - get the object cgroups vector associated with a page @@ -1093,6 +1100,11 @@ static inline bool PageMemcgKmem(struct page *page) return false; } +static inline bool PageMemcgSock(struct page *page) +{ + return false; +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; @@ -1554,6 +1566,15 @@ extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); + +void mem_cgroup_post_charge_sock_pages(struct mem_cgroup *memcg, + unsigned int nr_pages); +void mem_cgroup_uncharge_sock_page(struct page *page); +int mem_cgroup_prepare_sock_pages(struct mem_cgroup *memcg, struct page **pages, + u8 *page_prepared, unsigned int nr_pages); +int mem_cgroup_unprepare_sock_pages(struct mem_cgroup *memcg, struct page **pages, + u8 *page_prepared, unsigned int nr_pages); + static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure) @@ -1582,6 +1603,27 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { } + +static inline void mem_cgroup_post_charge_sock_pages(struct mem_cgroup *memcg, + unsigned int nr_pages) +{ +} + +static inline void mem_cgroup_uncharge_sock_page(struct page *page) +{ +} + +static inline int mem_cgroup_prepare_sock_pages(struct mem_cgroup *memcg, struct page **pages, + u8 *page_prepared, unsigned int nr_pages) +{ + return 0; +} + +static inline int mem_cgroup_unprepare_sock_pages(struct mem_cgroup *memcg, struct page **pages, + u8 *page_prepared, unsigned int nr_pages) +{ + return 0; +} #endif #ifdef CONFIG_MEMCG_KMEM diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 845eec01ef9d..aee126b0028c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7027,6 +7027,144 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) refill_stock(memcg, nr_pages); } +/** + * mem_cgroup_post_charge_sock_pages - charge socket memory + * for zerocopy pages mapped to userspace. + * @memcg: memcg to charge + * @nr_pages: number of pages + * + * When we perform a zero-copy receive on a socket, the receive payload memory + * pages are remapped to the user address space with vm_insert_pages(). 
+ * mem_cgroup_post_charge_sock_pages() accounts for this memory utilization; it is + * *not* mem_cgroup_charge_skmem() which accounts for memory use within socket + * buffers. + * + * Charges all @nr_pages to given memcg. The caller should have a reference + * on the given memcg. Unlike mem_cgroup_charge_skmem, the caller must also have + * set page->memcg_data for these pages beforehand. Decoupling this page + * manipulation from the charging allows us to avoid double-charging the pages, + * causing undue memory pressure. + */ +void mem_cgroup_post_charge_sock_pages(struct mem_cgroup *memcg, + unsigned int nr_pages) +{ + if (mem_cgroup_disabled() || !memcg) + return; + try_charge(memcg, GFP_KERNEL | __GFP_NOFAIL, nr_pages); + mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); +} + +/** + * mem_cgroup_uncharge_sock_page - uncharge socket page + * when unmapping zerocopy pages mapped to userspace. + * @page: page to uncharge + * + * mem_cgroup_uncharge_sock_page() drops the CSS reference taken by + * mem_cgroup_prepare_sock_pages(), and uncharges memory usage. + * Page cannot be compound. Must be called with page memcg lock held. + */ +void mem_cgroup_uncharge_sock_page(struct page *page) +{ + struct mem_cgroup *memcg; + + VM_BUG_ON(PageCompound(page)); + memcg = page_memcg(page); + if (!memcg) + return; + + mod_memcg_state(memcg, MEMCG_SOCK, -1); + refill_stock(memcg, 1); + css_put(&memcg->css); + page->memcg_data = 0; +} + +/** + * mem_cgroup_unprepare_sock_pages - unset memcg for unremapped zerocopy pages. + * @memcg: The memcg we were trying to account pages to. + * @pages: array of pages, some subset of which we must 'unprepare' + * @page_prepared: array of flags indicating if page must be unprepared + * @nr_pages: Length of pages and page_prepared arrays. + * + * If a zerocopy receive attempt failed to remap some pages to userspace, we + * must unset memcg on these pages, if we had previously set them with a + * matching call to mem_cgroup_prepare_sock_pages(). + * + * The caller must ensure that all input parameters are the same parameters + * (or a subset of the parameters) passed to the preceding call to + * mem_cgroup_prepare_sock_pages() otherwise undefined behaviour results. + * Returns how many pages were unprepared so caller post-charges the right + * amount of pages to the memcg. + */ +int mem_cgroup_unprepare_sock_pages(struct mem_cgroup *memcg, struct page **pages, + u8 *page_prepared, unsigned int nr_pages) +{ + unsigned int idx, cleared = 0; + + if (!memcg || mem_cgroup_disabled()) + return 0; + + for (idx = 0; idx < nr_pages; ++idx) { + if (!page_prepared[idx]) + continue; + /* We prepared this page so it is not LRU. */ + WRITE_ONCE(pages[idx]->memcg_data, 0); + ++cleared; + } + css_put_many(&memcg->css, cleared); + return cleared; +} + +/** + * mem_cgroup_prepare_sock_pages - set memcg for receive zerocopy pages. + * @memcg: The memcg we were trying to account pages to. + * @pages: array of pages, some subset of which we must 'prepare' + * @page_prepared: array of flags indicating if page was prepared + * @nr_pages: Length of pages and page_prepared arrays. + * + * If we are to remap pages to userspace in zerocopy receive, we must set the + * memcg for these pages before vm_insert_pages(), if the page does not already + * have a memcg set. However, if a memcg was already set, we do not adjust it. + * Explicitly track which pages we have prepared ourselves so we can 'unprepare' + * them if need be should page remapping fail. 
+ * + * The caller must ensure that all input parameter passed to a subsequent call + * to mem_cgroup_unprepare_sock_pages() are identical or a subset of these + * parameters otherwise undefined behaviour results. page_prepared must also be + * zero'd by the caller before invoking this method. Returns how many pages were + * prepared so caller post-charges the right amount of pages to the memcg. + */ +int mem_cgroup_prepare_sock_pages(struct mem_cgroup *memcg, struct page **pages, + u8 *page_prepared, unsigned int nr_pages) +{ + unsigned int idx, to_uncharge = 0; + const unsigned long memcg_data = (unsigned long) memcg | + MEMCG_DATA_SOCK; + + if (!memcg || mem_cgroup_disabled()) + return 0; + + css_get_many(&memcg->css, nr_pages); + for (idx = 0; idx < nr_pages; ++idx) { + struct page *page = pages[idx]; + + VM_BUG_ON(PageCompound(page)); + /* + * page->memcg_data == 0 implies non-LRU page. non-LRU pages are + * not changed by the rest of the kernel, so we only have to + * guard against concurrent access in the networking layer. + * cmpxchg(0) lets us both validate non-LRU and protects against + * concurrent access in networking layer. + */ + if (cmpxchg(&page->memcg_data, 0, memcg_data) == 0) + page_prepared[idx] = 1; + else + ++to_uncharge; + } + if (to_uncharge) + css_put_many(&memcg->css, to_uncharge); + return nr_pages - to_uncharge; +} + static int __init cgroup_memory(char *s) { char *token; diff --git a/mm/rmap.c b/mm/rmap.c index b0fc27e77d6d..d2a164769dcf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1290,6 +1290,9 @@ static void page_remove_file_rmap(struct page *page, bool compound) if (unlikely(PageMlocked(page))) clear_page_mlock(page); + + if (unlikely(PageMemcgSock(page))) + mem_cgroup_uncharge_sock_page(page); } static void page_remove_anon_compound_rmap(struct page *page) @@ -1345,7 +1348,7 @@ static void page_remove_anon_compound_rmap(struct page *page) */ void page_remove_rmap(struct page *page, bool compound) { - lock_page_memcg(page); + struct mem_cgroup *memcg = lock_page_memcg(page); if (!PageAnon(page)) { page_remove_file_rmap(page, compound); @@ -1384,7 +1387,7 @@ void page_remove_rmap(struct page *page, bool compound) * faster for those pages still in swapcache. */ out: - unlock_page_memcg(page); + __unlock_page_memcg(memcg); } /* diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index de7cc8445ac0..17dd5b57409f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1789,12 +1789,14 @@ static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, ++frag; } *offset_frag = 0; + prefetchw(skb_frag_page(frag)); return frag; } static bool can_map_frag(const skb_frag_t *frag) { - return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag); + return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag) && + !PageCompound(skb_frag_page(frag)); } static int find_next_mappable_frag(const skb_frag_t *frag, @@ -1944,14 +1946,19 @@ static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, struct page **pending_pages, - unsigned long pages_remaining, + u8 *page_prepared, + unsigned long leftover_pages, unsigned long *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, u32 total_bytes_to_map, - int err) + int err, + unsigned long *pages_acctd_total, + struct mem_cgroup *memcg) { + unsigned long pages_remaining = leftover_pages, pages_mapped = 0; + /* At least one page did not map. Try zapping if we skipped earlier. 
*/ if (err == -EBUSY && zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) { @@ -1965,14 +1972,14 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, } if (!err) { - unsigned long leftover_pages = pages_remaining; int bytes_mapped; /* We called zap_page_range, try to reinsert. */ err = vm_insert_pages(vma, *address, pending_pages, &pages_remaining); - bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining); + pages_mapped = leftover_pages - pages_remaining; + bytes_mapped = PAGE_SIZE * pages_mapped; *seq += bytes_mapped; *address += bytes_mapped; } @@ -1986,24 +1993,41 @@ static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, *length -= bytes_not_mapped; zc->recv_skip_hint += bytes_not_mapped; + + pending_pages += pages_mapped; + page_prepared += pages_mapped; + + /* For the pages that did not map, unset memcg and drop refs. */ + *pages_acctd_total -= mem_cgroup_unprepare_sock_pages(memcg, + pending_pages, + page_prepared, + pages_remaining); } return err; } static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, struct page **pages, + u8 *page_prepared, unsigned int pages_to_map, unsigned long *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, - u32 total_bytes_to_map) + u32 total_bytes_to_map, + unsigned long *pages_acctd_total, + struct mem_cgroup *memcg) { unsigned long pages_remaining = pages_to_map; unsigned int pages_mapped; unsigned int bytes_mapped; int err; + /* Before mapping, we must take css ref since unmap drops it. */ + *pages_acctd_total = mem_cgroup_prepare_sock_pages(memcg, pages, + page_prepared, + pages_to_map); + err = vm_insert_pages(vma, *address, pages, &pages_remaining); pages_mapped = pages_to_map - (unsigned int)pages_remaining; bytes_mapped = PAGE_SIZE * pages_mapped; @@ -2018,8 +2042,9 @@ static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, /* Error: maybe zap and retry + rollback state for failed inserts. 
*/ return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped, + page_prepared + pages_mapped, pages_remaining, address, length, seq, zc, total_bytes_to_map, - err); + err, pages_acctd_total, memcg); } #define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) @@ -2058,6 +2083,8 @@ static int tcp_zerocopy_receive(struct sock *sk, u32 length = 0, offset, vma_len, avail_len, copylen = 0; unsigned long address = (unsigned long)zc->address; struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; + u8 page_prepared[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; + unsigned long total_pages_acctd = 0; s32 copybuf_len = zc->copybuf_len; struct tcp_sock *tp = tcp_sk(sk); const skb_frag_t *frags = NULL; @@ -2065,6 +2092,7 @@ static int tcp_zerocopy_receive(struct sock *sk, struct vm_area_struct *vma; struct sk_buff *skb = NULL; u32 seq = tp->copied_seq; + struct mem_cgroup *memcg; u32 total_bytes_to_map; int inq = tcp_inq(sk); int ret; @@ -2110,12 +2138,15 @@ static int tcp_zerocopy_receive(struct sock *sk, zc->length = avail_len; zc->recv_skip_hint = avail_len; } + + memcg = get_mem_cgroup_from_mm(current->mm); ret = 0; while (length + PAGE_SIZE <= zc->length) { + bool skb_remainder_unmappable = zc->recv_skip_hint < PAGE_SIZE; int mappable_offset; struct page *page; - if (zc->recv_skip_hint < PAGE_SIZE) { + if (skb_remainder_unmappable) { u32 offset_frag; if (skb) { @@ -2144,30 +2175,46 @@ static int tcp_zerocopy_receive(struct sock *sk, break; } page = skb_frag_page(frags); - prefetchw(page); pages[pages_to_map++] = page; length += PAGE_SIZE; zc->recv_skip_hint -= PAGE_SIZE; frags++; + skb_remainder_unmappable = zc->recv_skip_hint < PAGE_SIZE; if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE || - zc->recv_skip_hint < PAGE_SIZE) { + skb_remainder_unmappable) { + unsigned long pages_acctd = 0; + /* Either full batch, or we're about to go to next skb * (and we cannot unroll failed ops across skbs). */ + memset(page_prepared, 0, sizeof(page_prepared)); ret = tcp_zerocopy_vm_insert_batch(vma, pages, + page_prepared, pages_to_map, &address, &length, &seq, zc, - total_bytes_to_map); + total_bytes_to_map, + &pages_acctd, + memcg); + total_pages_acctd += pages_acctd; if (ret) goto out; pages_to_map = 0; } + if (!skb_remainder_unmappable) + prefetchw(skb_frag_page(frags)); } if (pages_to_map) { - ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, - &address, &length, &seq, - zc, total_bytes_to_map); + unsigned long pages_acctd = 0; + + memset(page_prepared, 0, sizeof(page_prepared)); + ret = tcp_zerocopy_vm_insert_batch(vma, pages, + page_prepared, + pages_to_map, &address, + &length, &seq, zc, + total_bytes_to_map, + &pages_acctd, memcg); + total_pages_acctd += pages_acctd; } out: mmap_read_unlock(current->mm); @@ -2190,6 +2237,10 @@ static int tcp_zerocopy_receive(struct sock *sk, ret = -EIO; } zc->length = length; + + /* Account pages to user. */ + mem_cgroup_post_charge_sock_pages(memcg, total_pages_acctd); + mem_cgroup_put(memcg); return ret; } #endif
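[Not part of the patch, but for context on the lifecycle being accounted: a rough userspace sketch of the receive path, with error handling, the copybuf fallback for unaligned leftovers, and the TLB-clean hint all omitted. The pages that vm_insert_pages() maps above are the ones that appear at zc.address, and the munmap() at the end is what ultimately drives page_remove_rmap() and mem_cgroup_uncharge_sock_page().]

```c
#include <linux/tcp.h>		/* TCP_ZEROCOPY_RECEIVE, struct tcp_zerocopy_receive */
#include <netinet/in.h>		/* IPPROTO_TCP */
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>

/*
 * Receive one chunk from a connected TCP socket via zerocopy.
 * 'chunk' should be a multiple of the page size.  Returns the number
 * of bytes mapped, or -1 on error.
 */
static ssize_t zerocopy_recv_chunk(int fd, size_t chunk)
{
	struct tcp_zerocopy_receive zc;
	socklen_t zc_len = sizeof(zc);
	ssize_t ret = -1;
	void *addr;

	/* The receive window is mapped on the TCP socket itself. */
	addr = mmap(NULL, chunk, PROT_READ, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED)
		return -1;

	memset(&zc, 0, sizeof(zc));
	zc.address = (__u64)(unsigned long)addr;
	zc.length = chunk;

	/*
	 * The kernel remaps payload pages into [addr, addr + zc.length);
	 * with this patch those pages are charged to the caller's memcg.
	 * Bytes indicated by zc.recv_skip_hint cannot be mapped and must
	 * be read with recv() instead.
	 */
	if (!getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len))
		ret = zc.length;

	/* ... consume the mapped payload at addr here ... */

	/*
	 * Unmapping (real users batch this over many chunks to keep TLB
	 * shootdown costs low) removes the mappings, which is when the
	 * pages are uncharged from the memcg.
	 */
	munmap(addr, chunk);
	return ret;
}
```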