diff mbox series

[v2,1/1] swap: shmem: remove SWAP_MAP_SHMEM

Message ID 20241002012042.2753174-2-nphamcs@gmail.com (mailing list archive)
State New
Headers show
Series remove SWAP_MAP_SHMEM | expand

Commit Message

Nhat Pham Oct. 2, 2024, 1:20 a.m. UTC
The SWAP_MAP_SHMEM state was introduced in the commit aaa468653b4a
("swap_info: note SWAP_MAP_SHMEM"), to quickly determine if a swap entry
belongs to shmem during swapoff.

However, swapoff has since been rewritten in the commit b56a2d8af914
("mm: rid swapoff of quadratic complexity"). Now having swap count ==
SWAP_MAP_SHMEM value is basically the same as having swap count == 1,
and swap_shmem_alloc() behaves analogously to swap_duplicate(). The only
difference of note is that swap_shmem_alloc() does not check for
-ENOMEM returned from __swap_duplicate(), but it is OK because shmem
never re-duplicates any swap entry it owns. This will stil be safe if we
use (batched) swap_duplicate() instead.

This commit adds swap_duplicate_nr(), the batched variant of
swap_duplicate(), and removes the SWAP_MAP_SHMEM state and the
associated swap_shmem_alloc() helper to simplify the state machine (both
mentally and in terms of actual code). We will also have an extra
state/special value that can be repurposed (for swap entries that never
gets re-duplicated).

Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
 include/linux/swap.h | 16 ++++++++--------
 mm/shmem.c           |  2 +-
 mm/swapfile.c        | 41 +++++++++++++++++++++--------------------
 3 files changed, 30 insertions(+), 29 deletions(-)

Comments

Yosry Ahmed Oct. 2, 2024, 1:33 a.m. UTC | #1
On Tue, Oct 1, 2024 at 6:20 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> The SWAP_MAP_SHMEM state was introduced in the commit aaa468653b4a
> ("swap_info: note SWAP_MAP_SHMEM"), to quickly determine if a swap entry
> belongs to shmem during swapoff.
>
> However, swapoff has since been rewritten in the commit b56a2d8af914
> ("mm: rid swapoff of quadratic complexity"). Now having swap count ==
> SWAP_MAP_SHMEM value is basically the same as having swap count == 1,
> and swap_shmem_alloc() behaves analogously to swap_duplicate(). The only
> difference of note is that swap_shmem_alloc() does not check for
> -ENOMEM returned from __swap_duplicate(), but it is OK because shmem
> never re-duplicates any swap entry it owns. This will stil be safe if we
> use (batched) swap_duplicate() instead.
>
> This commit adds swap_duplicate_nr(), the batched variant of
> swap_duplicate(), and removes the SWAP_MAP_SHMEM state and the
> associated swap_shmem_alloc() helper to simplify the state machine (both
> mentally and in terms of actual code). We will also have an extra
> state/special value that can be repurposed (for swap entries that never
> gets re-duplicated).
>
> Signed-off-by: Nhat Pham <nphamcs@gmail.com>
> ---
>  include/linux/swap.h | 16 ++++++++--------
>  mm/shmem.c           |  2 +-
>  mm/swapfile.c        | 41 +++++++++++++++++++++--------------------
>  3 files changed, 30 insertions(+), 29 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index ca533b478c21..017f3c03ff7a 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -232,7 +232,6 @@ enum {
>  /* Special value in first swap_map */
>  #define SWAP_MAP_MAX   0x3e    /* Max count */
>  #define SWAP_MAP_BAD   0x3f    /* Note page is bad */
> -#define SWAP_MAP_SHMEM 0xbf    /* Owned by shmem/tmpfs */
>
>  /* Special value in each swap_map continuation */
>  #define SWAP_CONT_MAX  0x7f    /* Max count */
> @@ -482,8 +481,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry);
>  extern swp_entry_t get_swap_page_of_type(int);
>  extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
>  extern int add_swap_count_continuation(swp_entry_t, gfp_t);
> -extern void swap_shmem_alloc(swp_entry_t, int);
> -extern int swap_duplicate(swp_entry_t);
> +extern int swap_duplicate_nr(swp_entry_t, int);
>  extern int swapcache_prepare(swp_entry_t entry, int nr);
>  extern void swap_free_nr(swp_entry_t entry, int nr_pages);
>  extern void swapcache_free_entries(swp_entry_t *entries, int n);
> @@ -549,11 +547,7 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
>         return 0;
>  }
>
> -static inline void swap_shmem_alloc(swp_entry_t swp, int nr)
> -{
> -}
> -
> -static inline int swap_duplicate(swp_entry_t swp)
> +static inline int swap_duplicate_nr(swp_entry_t swp, int nr)
>  {
>         return 0;
>  }
> @@ -606,6 +600,12 @@ static inline int add_swap_extent(struct swap_info_struct *sis,
>  }
>  #endif /* CONFIG_SWAP */
>
> +static inline int swap_duplicate(swp_entry_t entry)
> +{
> +       return swap_duplicate_nr(entry, 1);
> +}
> +
> +

Nit: extra blank line.

>  static inline void free_swap_and_cache(swp_entry_t entry)
>  {
>         free_swap_and_cache_nr(entry, 1);
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 0613421e09e7..e3f72f99be32 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1561,7 +1561,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
>                         __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
>                         NULL) == 0) {
>                 shmem_recalc_inode(inode, 0, nr_pages);
> -               swap_shmem_alloc(swap, nr_pages);
> +               swap_duplicate_nr(swap, nr_pages);
>                 shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
>
>                 mutex_unlock(&shmem_swaplist_mutex);
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 0cded32414a1..9bb94e618914 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1381,12 +1381,6 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
>         if (usage == SWAP_HAS_CACHE) {
>                 VM_BUG_ON(!has_cache);
>                 has_cache = 0;
> -       } else if (count == SWAP_MAP_SHMEM) {
> -               /*
> -                * Or we could insist on shmem.c using a special
> -                * swap_shmem_free() and free_shmem_swap_and_cache()...
> -                */
> -               count = 0;
>         } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
>                 if (count == COUNT_CONTINUED) {
>                         if (swap_count_continued(si, offset, count))
> @@ -3626,7 +3620,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
>
>         offset = swp_offset(entry);
>         VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
> -       VM_WARN_ON(usage == 1 && nr > 1);
>         ci = lock_cluster_or_swap_info(si, offset);
>
>         err = 0;
> @@ -3652,6 +3645,13 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
>                                 err = -EEXIST;
>                 } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) {
>                         err = -EINVAL;
> +               } else {
> +                       /*
> +                        * The only swap_duplicate_nr() caller that passes nr > 1 is shmem,
> +                        * who never re-duplicates any swap entry it owns. So this should

nit: I think "which" is the right word here, but I am not a native speaker :)

> +                        * not happen.
> +                        */
> +                       VM_WARN_ON(nr > 1 && (count & ~COUNT_CONTINUED) == SWAP_MAP_MAX);

Why not return an error in this case? I think we should add recovery
for bugs when it's possible and simple, which I believe is the case
here.

In shmem_writepage() we can add a WARN if swap_duplicate_nr() fails,
or propagate an error to the caller as well (perhaps this belongs in a
separate patch that does this for swap_shmem_alloc() first).

Sorry if I am being paranoid here, please let me know if this is the case.

>                 }
>
>                 if (err)
> @@ -3686,27 +3686,28 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
>         return err;
>  }
>
> -/*
> - * Help swapoff by noting that swap entry belongs to shmem/tmpfs
> - * (in which case its reference count is never incremented).
> - */
> -void swap_shmem_alloc(swp_entry_t entry, int nr)
> -{
> -       __swap_duplicate(entry, SWAP_MAP_SHMEM, nr);
> -}
> -
> -/*
> - * Increase reference count of swap entry by 1.
> +/**
> + * swap_duplicate_nr() - Increase reference count of nr contiguous swap entries
> + *                       by 1.

Can we avoid the line break by using "refcount" instead of "reference count"?

> + *
> + * @entry: first swap entry from which we want to increase the refcount.
> + * @nr: Number of entries in range.
> + *
>   * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
>   * but could not be atomically allocated.  Returns 0, just as if it succeeded,
>   * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
>   * might occur if a page table entry has got corrupted.
> + *
> + * Note that we are currently not handling the case where nr > 1 and we need to
> + * add swap count continuation. This is OK, because no such user exists - shmem
> + * is the only user that can pass nr > 1, and it never re-duplicates any swap
> + * entry it owns.

Do we need this comment when we have the WARN + comment in __swap_duplicate()?

>   */
> -int swap_duplicate(swp_entry_t entry)
> +int swap_duplicate_nr(swp_entry_t entry, int nr)
>  {
>         int err = 0;
>
> -       while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM)
> +       while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM)
>                 err = add_swap_count_continuation(entry, GFP_ATOMIC);
>         return err;
>  }
> --
> 2.43.5
Nhat Pham Oct. 2, 2024, 1:58 a.m. UTC | #2
On Tue, Oct 1, 2024 at 6:33 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
>
> Nit: extra blank line.
>
> >  static inline void free_swap_and_cache(swp_entry_t entry)
> >  {
> >         free_swap_and_cache_nr(entry, 1);
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 0613421e09e7..e3f72f99be32 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -1561,7 +1561,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
> >                         __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
> >                         NULL) == 0) {
> >                 shmem_recalc_inode(inode, 0, nr_pages);
> > -               swap_shmem_alloc(swap, nr_pages);
> > +               swap_duplicate_nr(swap, nr_pages);
> >                 shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
> >
> >                 mutex_unlock(&shmem_swaplist_mutex);
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 0cded32414a1..9bb94e618914 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -1381,12 +1381,6 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
> >         if (usage == SWAP_HAS_CACHE) {
> >                 VM_BUG_ON(!has_cache);
> >                 has_cache = 0;
> > -       } else if (count == SWAP_MAP_SHMEM) {
> > -               /*
> > -                * Or we could insist on shmem.c using a special
> > -                * swap_shmem_free() and free_shmem_swap_and_cache()...
> > -                */
> > -               count = 0;
> >         } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
> >                 if (count == COUNT_CONTINUED) {
> >                         if (swap_count_continued(si, offset, count))
> > @@ -3626,7 +3620,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
> >
> >         offset = swp_offset(entry);
> >         VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
> > -       VM_WARN_ON(usage == 1 && nr > 1);
> >         ci = lock_cluster_or_swap_info(si, offset);
> >
> >         err = 0;
> > @@ -3652,6 +3645,13 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
> >                                 err = -EEXIST;
> >                 } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) {
> >                         err = -EINVAL;
> > +               } else {
> > +                       /*
> > +                        * The only swap_duplicate_nr() caller that passes nr > 1 is shmem,
> > +                        * who never re-duplicates any swap entry it owns. So this should
>
> nit: I think "which" is the right word here, but I am not a native speaker :)

Yeah I think it should be which. Fix(let) incoming.

>
> > +                        * not happen.
> > +                        */
> > +                       VM_WARN_ON(nr > 1 && (count & ~COUNT_CONTINUED) == SWAP_MAP_MAX);
>
> Why not return an error in this case? I think we should add recovery
> for bugs when it's possible and simple, which I believe is the case
> here.
>
> In shmem_writepage() we can add a WARN if swap_duplicate_nr() fails,
> or propagate an error to the caller as well (perhaps this belongs in a
> separate patch that does this for swap_shmem_alloc() first).
>
> Sorry if I am being paranoid here, please let me know if this is the case.

I was debating between WARN-ing here, and returning -ENOMEM and
WARN-ing at shmem's callsite.

My thinking is that if we return -ENOMEM here, it will work in the
current setup, for both shmem and other callsites. However, in the
future, if we add another user of swap_duplicate_nr(), this time
without guaranteeing that we won't need continuation, I think it won't
work unless we have the fallback logic in place as well:

while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM)
err = add_swap_count_continuation(entry, GFP_ATOMIC);

>
> >                 }
> >
> >                 if (err)
> > @@ -3686,27 +3686,28 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
> >         return err;
> >  }
> >
> > -/*
> > - * Help swapoff by noting that swap entry belongs to shmem/tmpfs
> > - * (in which case its reference count is never incremented).
> > - */
> > -void swap_shmem_alloc(swp_entry_t entry, int nr)
> > -{
> > -       __swap_duplicate(entry, SWAP_MAP_SHMEM, nr);
> > -}
> > -
> > -/*
> > - * Increase reference count of swap entry by 1.
> > +/**
> > + * swap_duplicate_nr() - Increase reference count of nr contiguous swap entries
> > + *                       by 1.
>
> Can we avoid the line break by using "refcount" instead of "reference count"?
>
> > + *
> > + * @entry: first swap entry from which we want to increase the refcount.
> > + * @nr: Number of entries in range.
> > + *
> >   * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
> >   * but could not be atomically allocated.  Returns 0, just as if it succeeded,
> >   * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
> >   * might occur if a page table entry has got corrupted.
> > + *
> > + * Note that we are currently not handling the case where nr > 1 and we need to
> > + * add swap count continuation. This is OK, because no such user exists - shmem
> > + * is the only user that can pass nr > 1, and it never re-duplicates any swap
> > + * entry it owns.
>
> Do we need this comment when we have the WARN + comment in __swap_duplicate()?

Here I'm just being cautious and include the limitation of the
function in the API documentation itself.

No strong opinions though.

>
> >   */
> > -int swap_duplicate(swp_entry_t entry)
> > +int swap_duplicate_nr(swp_entry_t entry, int nr)
> >  {
> >         int err = 0;
> >
> > -       while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM)
> > +       while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM)
> >                 err = add_swap_count_continuation(entry, GFP_ATOMIC);
> >         return err;
> >  }
> > --
> > 2.43.5
Nhat Pham Oct. 2, 2024, 2:04 a.m. UTC | #3
On Tue, Oct 1, 2024 at 6:58 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Tue, Oct 1, 2024 at 6:33 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> I was debating between WARN-ing here, and returning -ENOMEM and
> WARN-ing at shmem's callsite.
>
> My thinking is that if we return -ENOMEM here, it will work in the
> current setup, for both shmem and other callsites. However, in the
> future, if we add another user of swap_duplicate_nr(), this time
> without guaranteeing that we won't need continuation, I think it won't
> work unless we have the fallback logic in place as well:
>
> while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM)
> err = add_swap_count_continuation(entry, GFP_ATOMIC);

Sorry, I accidentally sent out the email without completing my explanation :)

Anyway, the point being, with the current implementation, any new user
would immediately hit a WARN and the implementer will know to check.

Whereas if we return -ENOMEM in __swap_duplicate(), then I think we
would just hang, no? We only try to add swap count continuation to the
first entry only, which is not sufficient to fix the problem.

I can probably whip up the fallback logic here, but it would be dead,
untestable code (as it has no users, and I cannot even conceive one to
test it). And the swap abstraction might render all of this moot
anyway.
Yosry Ahmed Oct. 2, 2024, 2:06 a.m. UTC | #4
On Tue, Oct 1, 2024 at 7:04 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Tue, Oct 1, 2024 at 6:58 PM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Tue, Oct 1, 2024 at 6:33 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > I was debating between WARN-ing here, and returning -ENOMEM and
> > WARN-ing at shmem's callsite.
> >
> > My thinking is that if we return -ENOMEM here, it will work in the
> > current setup, for both shmem and other callsites. However, in the
> > future, if we add another user of swap_duplicate_nr(), this time
> > without guaranteeing that we won't need continuation, I think it won't
> > work unless we have the fallback logic in place as well:
> >
> > while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM)
> > err = add_swap_count_continuation(entry, GFP_ATOMIC);
>
> Sorry, I accidentally sent out the email without completing my explanation :)
>
> Anyway, the point being, with the current implementation, any new user
> would immediately hit a WARN and the implementer will know to check.
>
> Whereas if we return -ENOMEM in __swap_duplicate(), then I think we
> would just hang, no? We only try to add swap count continuation to the
> first entry only, which is not sufficient to fix the problem.
>
> I can probably whip up the fallback logic here, but it would be dead,
> untestable code (as it has no users, and I cannot even conceive one to
> test it). And the swap abstraction might render all of this moot
> anyway.

What I had in mind is not returning -ENOMEM at all, but something like
-EOPNOTSUPP. The swap_duplicate_nr() will just return the error to the
caller. All callers of swap_duplicate() and swap_duplicate_nr()
currently check the error except shmem.
Yosry Ahmed Oct. 2, 2024, 2:11 a.m. UTC | #5
[..]
> > > + *
> > > + * @entry: first swap entry from which we want to increase the refcount.
> > > + * @nr: Number of entries in range.
> > > + *
> > >   * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
> > >   * but could not be atomically allocated.  Returns 0, just as if it succeeded,
> > >   * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
> > >   * might occur if a page table entry has got corrupted.
> > > + *
> > > + * Note that we are currently not handling the case where nr > 1 and we need to
> > > + * add swap count continuation. This is OK, because no such user exists - shmem
> > > + * is the only user that can pass nr > 1, and it never re-duplicates any swap
> > > + * entry it owns.
> >
> > Do we need this comment when we have the WARN + comment in __swap_duplicate()?
>
> Here I'm just being cautious and include the limitation of the
> function in the API documentation itself.
>
> No strong opinions though.

Maybe it would be more useful to add a warning in the loop if nr > 1,
with a comment that explains that the current -ENOMEM handling does
not properly handle nr > 1?

> >
> > >   */
> > > -int swap_duplicate(swp_entry_t entry)
> > > +int swap_duplicate_nr(swp_entry_t entry, int nr)
> > >  {
> > >         int err = 0;
> > >
> > > -       while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM)
> > > +       while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM)
> > >                 err = add_swap_count_continuation(entry, GFP_ATOMIC);
> > >         return err;
> > >  }
> > > --
> > > 2.43.5
Yosry Ahmed Oct. 2, 2024, 2:13 a.m. UTC | #6
On Tue, Oct 1, 2024 at 7:06 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Tue, Oct 1, 2024 at 7:04 PM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Tue, Oct 1, 2024 at 6:58 PM Nhat Pham <nphamcs@gmail.com> wrote:
> > >
> > > On Tue, Oct 1, 2024 at 6:33 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > >
> > > I was debating between WARN-ing here, and returning -ENOMEM and
> > > WARN-ing at shmem's callsite.
> > >
> > > My thinking is that if we return -ENOMEM here, it will work in the
> > > current setup, for both shmem and other callsites. However, in the
> > > future, if we add another user of swap_duplicate_nr(), this time
> > > without guaranteeing that we won't need continuation, I think it won't
> > > work unless we have the fallback logic in place as well:
> > >
> > > while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM)
> > > err = add_swap_count_continuation(entry, GFP_ATOMIC);
> >
> > Sorry, I accidentally sent out the email without completing my explanation :)
> >
> > Anyway, the point being, with the current implementation, any new user
> > would immediately hit a WARN and the implementer will know to check.
> >
> > Whereas if we return -ENOMEM in __swap_duplicate(), then I think we
> > would just hang, no? We only try to add swap count continuation to the
> > first entry only, which is not sufficient to fix the problem.
> >
> > I can probably whip up the fallback logic here, but it would be dead,
> > untestable code (as it has no users, and I cannot even conceive one to
> > test it). And the swap abstraction might render all of this moot
> > anyway.
>
> What I had in mind is not returning -ENOMEM at all, but something like
> -EOPNOTSUPP. The swap_duplicate_nr() will just return the error to the
> caller. All callers of swap_duplicate() and swap_duplicate_nr()
> currently check the error except shmem.

..and just to be extra clear, I meant WARN _and_ return -EOPNOTSUPP.
Nhat Pham Oct. 2, 2024, 6:01 p.m. UTC | #7
On Tue, Oct 1, 2024 at 7:14 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Tue, Oct 1, 2024 at 7:06 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Tue, Oct 1, 2024 at 7:04 PM Nhat Pham <nphamcs@gmail.com> wrote:
> > >
> > > On Tue, Oct 1, 2024 at 6:58 PM Nhat Pham <nphamcs@gmail.com> wrote:
> > > >
> > > > On Tue, Oct 1, 2024 at 6:33 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > > >
> > > > I was debating between WARN-ing here, and returning -ENOMEM and
> > > > WARN-ing at shmem's callsite.
> > > >
> > > > My thinking is that if we return -ENOMEM here, it will work in the
> > > > current setup, for both shmem and other callsites. However, in the
> > > > future, if we add another user of swap_duplicate_nr(), this time
> > > > without guaranteeing that we won't need continuation, I think it won't
> > > > work unless we have the fallback logic in place as well:
> > > >
> > > > while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM)
> > > > err = add_swap_count_continuation(entry, GFP_ATOMIC);
> > >
> > > Sorry, I accidentally sent out the email without completing my explanation :)
> > >
> > > Anyway, the point being, with the current implementation, any new user
> > > would immediately hit a WARN and the implementer will know to check.
> > >
> > > Whereas if we return -ENOMEM in __swap_duplicate(), then I think we
> > > would just hang, no? We only try to add swap count continuation to the
> > > first entry only, which is not sufficient to fix the problem.
> > >
> > > I can probably whip up the fallback logic here, but it would be dead,
> > > untestable code (as it has no users, and I cannot even conceive one to
> > > test it). And the swap abstraction might render all of this moot
> > > anyway.
> >
> > What I had in mind is not returning -ENOMEM at all, but something like
> > -EOPNOTSUPP. The swap_duplicate_nr() will just return the error to the
> > caller. All callers of swap_duplicate() and swap_duplicate_nr()
> > currently check the error except shmem.
>
> ..and just to be extra clear, I meant WARN _and_ return -EOPNOTSUPP.

Ah ok this makes a lot of sense actually.

I'll return -EOPNOTSUPP here. Do you think warn within
__swap_duplicate() makes more sense, or at shmem's callsite make more
sense?

I feel like we should warn within __swap_duplicate callsite. That way
if we accidentally screw up for other swap_duplicaters in the future,
the feedback will be immediate :)
Yosry Ahmed Oct. 2, 2024, 6:06 p.m. UTC | #8
On Wed, Oct 2, 2024 at 11:01 AM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Tue, Oct 1, 2024 at 7:14 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Tue, Oct 1, 2024 at 7:06 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > >
> > > On Tue, Oct 1, 2024 at 7:04 PM Nhat Pham <nphamcs@gmail.com> wrote:
> > > >
> > > > On Tue, Oct 1, 2024 at 6:58 PM Nhat Pham <nphamcs@gmail.com> wrote:
> > > > >
> > > > > On Tue, Oct 1, 2024 at 6:33 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > > > >
> > > > > I was debating between WARN-ing here, and returning -ENOMEM and
> > > > > WARN-ing at shmem's callsite.
> > > > >
> > > > > My thinking is that if we return -ENOMEM here, it will work in the
> > > > > current setup, for both shmem and other callsites. However, in the
> > > > > future, if we add another user of swap_duplicate_nr(), this time
> > > > > without guaranteeing that we won't need continuation, I think it won't
> > > > > work unless we have the fallback logic in place as well:
> > > > >
> > > > > while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM)
> > > > > err = add_swap_count_continuation(entry, GFP_ATOMIC);
> > > >
> > > > Sorry, I accidentally sent out the email without completing my explanation :)
> > > >
> > > > Anyway, the point being, with the current implementation, any new user
> > > > would immediately hit a WARN and the implementer will know to check.
> > > >
> > > > Whereas if we return -ENOMEM in __swap_duplicate(), then I think we
> > > > would just hang, no? We only try to add swap count continuation to the
> > > > first entry only, which is not sufficient to fix the problem.
> > > >
> > > > I can probably whip up the fallback logic here, but it would be dead,
> > > > untestable code (as it has no users, and I cannot even conceive one to
> > > > test it). And the swap abstraction might render all of this moot
> > > > anyway.
> > >
> > > What I had in mind is not returning -ENOMEM at all, but something like
> > > -EOPNOTSUPP. The swap_duplicate_nr() will just return the error to the
> > > caller. All callers of swap_duplicate() and swap_duplicate_nr()
> > > currently check the error except shmem.
> >
> > ..and just to be extra clear, I meant WARN _and_ return -EOPNOTSUPP.
>
> Ah ok this makes a lot of sense actually.
>
> I'll return -EOPNOTSUPP here. Do you think warn within
> __swap_duplicate() makes more sense, or at shmem's callsite make more
> sense?
>
> I feel like we should warn within __swap_duplicate callsite. That way
> if we accidentally screw up for other swap_duplicaters in the future,
> the feedback will be immediate :)

I think we should warn in __swap_duplicate(). We can also propagate
the error from shmem_writepage() to the caller, but I think this may
need extra cleanup to be properly handled, didn't look too closely.

We can also warn in swap_duplicate_nr() if we ever reach the -ENOMEM
fallback code with nr > 1, and document there that the current
fallback logic does not handle this case (instead of documenting it
above the function). This will make sure we never return -ENOMEM from
__swap_duplicate() incorrectly.
diff mbox series

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index ca533b478c21..017f3c03ff7a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -232,7 +232,6 @@  enum {
 /* Special value in first swap_map */
 #define SWAP_MAP_MAX	0x3e	/* Max count */
 #define SWAP_MAP_BAD	0x3f	/* Note page is bad */
-#define SWAP_MAP_SHMEM	0xbf	/* Owned by shmem/tmpfs */
 
 /* Special value in each swap_map continuation */
 #define SWAP_CONT_MAX	0x7f	/* Max count */
@@ -482,8 +481,7 @@  void put_swap_folio(struct folio *folio, swp_entry_t entry);
 extern swp_entry_t get_swap_page_of_type(int);
 extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
-extern void swap_shmem_alloc(swp_entry_t, int);
-extern int swap_duplicate(swp_entry_t);
+extern int swap_duplicate_nr(swp_entry_t, int);
 extern int swapcache_prepare(swp_entry_t entry, int nr);
 extern void swap_free_nr(swp_entry_t entry, int nr_pages);
 extern void swapcache_free_entries(swp_entry_t *entries, int n);
@@ -549,11 +547,7 @@  static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
 	return 0;
 }
 
-static inline void swap_shmem_alloc(swp_entry_t swp, int nr)
-{
-}
-
-static inline int swap_duplicate(swp_entry_t swp)
+static inline int swap_duplicate_nr(swp_entry_t swp, int nr)
 {
 	return 0;
 }
@@ -606,6 +600,12 @@  static inline int add_swap_extent(struct swap_info_struct *sis,
 }
 #endif /* CONFIG_SWAP */
 
+static inline int swap_duplicate(swp_entry_t entry)
+{
+	return swap_duplicate_nr(entry, 1);
+}
+
+
 static inline void free_swap_and_cache(swp_entry_t entry)
 {
 	free_swap_and_cache_nr(entry, 1);
diff --git a/mm/shmem.c b/mm/shmem.c
index 0613421e09e7..e3f72f99be32 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1561,7 +1561,7 @@  static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 			__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
 			NULL) == 0) {
 		shmem_recalc_inode(inode, 0, nr_pages);
-		swap_shmem_alloc(swap, nr_pages);
+		swap_duplicate_nr(swap, nr_pages);
 		shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
 
 		mutex_unlock(&shmem_swaplist_mutex);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0cded32414a1..9bb94e618914 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1381,12 +1381,6 @@  static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
 	if (usage == SWAP_HAS_CACHE) {
 		VM_BUG_ON(!has_cache);
 		has_cache = 0;
-	} else if (count == SWAP_MAP_SHMEM) {
-		/*
-		 * Or we could insist on shmem.c using a special
-		 * swap_shmem_free() and free_shmem_swap_and_cache()...
-		 */
-		count = 0;
 	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
 		if (count == COUNT_CONTINUED) {
 			if (swap_count_continued(si, offset, count))
@@ -3626,7 +3620,6 @@  static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
 
 	offset = swp_offset(entry);
 	VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
-	VM_WARN_ON(usage == 1 && nr > 1);
 	ci = lock_cluster_or_swap_info(si, offset);
 
 	err = 0;
@@ -3652,6 +3645,13 @@  static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
 				err = -EEXIST;
 		} else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) {
 			err = -EINVAL;
+		} else {
+			/*
+			 * The only swap_duplicate_nr() caller that passes nr > 1 is shmem,
+			 * who never re-duplicates any swap entry it owns. So this should
+			 * not happen.
+			 */
+			VM_WARN_ON(nr > 1 && (count & ~COUNT_CONTINUED) == SWAP_MAP_MAX);
 		}
 
 		if (err)
@@ -3686,27 +3686,28 @@  static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
 	return err;
 }
 
-/*
- * Help swapoff by noting that swap entry belongs to shmem/tmpfs
- * (in which case its reference count is never incremented).
- */
-void swap_shmem_alloc(swp_entry_t entry, int nr)
-{
-	__swap_duplicate(entry, SWAP_MAP_SHMEM, nr);
-}
-
-/*
- * Increase reference count of swap entry by 1.
+/**
+ * swap_duplicate_nr() - Increase reference count of nr contiguous swap entries
+ *                       by 1.
+ *
+ * @entry: first swap entry from which we want to increase the refcount.
+ * @nr: Number of entries in range.
+ *
  * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
  * but could not be atomically allocated.  Returns 0, just as if it succeeded,
  * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
  * might occur if a page table entry has got corrupted.
+ *
+ * Note that we are currently not handling the case where nr > 1 and we need to
+ * add swap count continuation. This is OK, because no such user exists - shmem
+ * is the only user that can pass nr > 1, and it never re-duplicates any swap
+ * entry it owns.
  */
-int swap_duplicate(swp_entry_t entry)
+int swap_duplicate_nr(swp_entry_t entry, int nr)
 {
 	int err = 0;
 
-	while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM)
+	while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM)
 		err = add_swap_count_continuation(entry, GFP_ATOMIC);
 	return err;
 }