@@ -307,7 +307,7 @@ struct swap_info_struct {
/* list of clusters that are fragmented or contended */
unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
unsigned int pages; /* total of usable pages of swap */
- unsigned int inuse_pages; /* number of those currently in use */
+ atomic_long_t inuse_pages; /* number of those currently in use */
struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
struct rb_root swap_extent_root;/* root of the swap extent rbtree */
struct block_device *bdev; /* swap device or bdev of swap file */
@@ -128,6 +128,26 @@ static inline unsigned char swap_count(unsigned char ent)
return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
}
+/*
+ * Use the second highest bit of the inuse_pages counter as an
+ * indicator of whether a swap device is on the available plist, so
+ * the atomic value can still be updated arithmetically while having
+ * special data embedded.
+ *
+ * The inuse_pages counter is the only thing indicating whether a
+ * device should be on avail_lists or not (except during swapon /
+ * swapoff). By embedding the off-list bit in the atomic counter,
+ * updates no longer need any lock to check the list status.
+ *
+ * This bit is set if the device is not on the plist and not usable,
+ * and cleared if the device is on the plist.
+ */
+#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
+#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
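+
+/*
+ * For example (illustrative numbers): a device with si->pages == 1000
+ * that is currently off the plist may have inuse_pages set to
+ * (SWAP_USAGE_OFFLIST_BIT | 1000). Since the page count stays far below
+ * the flag bit, atomic add / sub never carry into it, and
+ * swap_usage_in_pages() masks the bit off, still reporting 1000.
+ */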
+static long swap_usage_in_pages(struct swap_info_struct *si)
+{
+ return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
+}
+
/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY 0x1
/*
@@ -717,7 +737,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
int nr_reclaim;
if (force)
- to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
+ to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
while (!list_empty(&si->full_clusters)) {
ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list);
@@ -872,42 +892,124 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
return found;
}
-static void __del_from_avail_list(struct swap_info_struct *si)
+/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
+static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
{
int nid;
- assert_spin_locked(&si->lock);
+ spin_lock(&swap_avail_lock);
+
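+ /*
+ * The off-list bit and the plist membership are only changed here
+ * and in add_to_avail_list(), always under swap_avail_lock, so the
+ * bit always reflects whether the device is on the plists.
+ */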
+ if (swapoff) {
+ /*
+ * Forcefully remove it. Clear the SWP_WRITEOK flag for
+ * swapoff here so the clearing is synchronized by both
+ * si->lock and swap_avail_lock, to ensure the result can
+ * be seen by add_to_avail_list.
+ */
+ lockdep_assert_held(&si->lock);
+ si->flags &= ~SWP_WRITEOK;
+ atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
+ } else {
+ /*
+ * If not called by swapoff, take it off-list only if it's
+ * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
+ * si->inuse_pages == si->pages). Any concurrent slot
+ * freeing, or the device having already been removed from
+ * the plist by someone else, will make the cmpxchg below
+ * fail, and the removal is skipped.
+ */
+ if (atomic_long_cmpxchg(&si->inuse_pages, si->pages,
+ si->pages | SWAP_USAGE_OFFLIST_BIT) != si->pages)
+ goto skip;
+ }
+
for_each_node(nid)
plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
+
+skip:
+ spin_unlock(&swap_avail_lock);
}
-static void del_from_avail_list(struct swap_info_struct *si)
+/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
+static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
{
+ int nid;
+ long val;
+
spin_lock(&swap_avail_lock);
- __del_from_avail_list(si);
+
+ /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
+ if (swapon) {
+ lockdep_assert_held(&si->lock);
+ si->flags |= SWP_WRITEOK;
+ } else {
+ if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
+ goto skip;
+ }
+
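+ /* Nothing to do if the device is already on the plist (bit clear). */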
+ if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
+ goto skip;
+
+ val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
+
+ /*
+ * When the device is full and still on the plist, only one updater
+ * will see (inuse_pages == si->pages) and will call
+ * del_from_avail_list. If that updater happens to be here, just skip
+ * adding.
+ */
+ if (val == si->pages) {
+ /* Just like the cmpxchg in del_from_avail_list */
+ if (atomic_long_cmpxchg(&si->inuse_pages, si->pages,
+ si->pages | SWAP_USAGE_OFFLIST_BIT) == si->pages)
+ goto skip;
+ }
+
+ for_each_node(nid)
+ plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
+
+skip:
spin_unlock(&swap_avail_lock);
}
-static void swap_range_alloc(struct swap_info_struct *si,
- unsigned int nr_entries)
+/*
+ * swap_usage_add / swap_usage_sub for each slot are serialized by ci->lock
+ * within each cluster, so the total contribution to the global counter can
+ * never go negative and cannot exceed the total number of usable slots.
+ */
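+/*
+ * For example (illustrative numbers): with si->pages == 1000 and the
+ * device on the plist, the allocation that brings inuse_pages from 996
+ * to 1000 sees val == si->pages in swap_usage_add() and takes the
+ * device off the plist; a later free then finds SWAP_USAGE_OFFLIST_BIT
+ * set in swap_usage_sub() and puts the device back on the plist.
+ */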
+static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
{
- WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
- if (si->inuse_pages == si->pages) {
- del_from_avail_list(si);
+ long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
- if (si->cluster_info && vm_swap_full())
- schedule_work(&si->reclaim_work);
+ /*
+ * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set,
+ * remove it from the plist.
+ */
+ if (unlikely(val == si->pages)) {
+ del_from_avail_list(si, false);
+ return true;
}
+
+ return false;
}
-static void add_to_avail_list(struct swap_info_struct *si)
+static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
{
- int nid;
+ long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
- spin_lock(&swap_avail_lock);
- for_each_node(nid)
- plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
- spin_unlock(&swap_avail_lock);
+ /*
+ * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
+ * add it back to the plist.
+ */
+ if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
+ add_to_avail_list(si, false);
+}
+
+static void swap_range_alloc(struct swap_info_struct *si,
+ unsigned int nr_entries)
+{
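+ /*
+ * swap_usage_add() returns true only for the allocation that just
+ * filled the device, so full-cluster reclaim is only considered at
+ * that point.
+ */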
+ if (swap_usage_add(si, nr_entries)) {
+ if (si->cluster_info && vm_swap_full())
+ schedule_work(&si->reclaim_work);
+ }
}
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
@@ -925,8 +1027,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
for (i = 0; i < nr_entries; i++)
clear_bit(offset + i, si->zeromap);
- if (si->inuse_pages == si->pages)
- add_to_avail_list(si);
if (si->flags & SWP_BLKDEV)
swap_slot_free_notify =
si->bdev->bd_disk->fops->swap_slot_free_notify;
@@ -946,7 +1046,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
*/
smp_wmb();
atomic_long_add(nr_entries, &nr_swap_pages);
- WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
+ swap_usage_sub(si, nr_entries);
}
static int cluster_alloc_swap(struct swap_info_struct *si,
@@ -1036,19 +1136,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
spin_lock(&si->lock);
- if ((si->inuse_pages == si->pages) || !(si->flags & SWP_WRITEOK)) {
- spin_lock(&swap_avail_lock);
- if (plist_node_empty(&si->avail_lists[node])) {
- spin_unlock(&si->lock);
- goto nextsi;
- }
- WARN(!(si->flags & SWP_WRITEOK),
- "swap_info %d in list but !SWP_WRITEOK\n",
- si->type);
- __del_from_avail_list(si);
- spin_unlock(&si->lock);
- goto nextsi;
- }
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
n_goal, swp_entries, order);
spin_unlock(&si->lock);
@@ -1057,7 +1144,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
cond_resched();
spin_lock(&swap_avail_lock);
-nextsi:
/*
* if we got here, it's likely that si was almost full before,
* and since scan_swap_map_slots() can drop the si->lock,
@@ -1789,7 +1875,7 @@ unsigned int count_swap_pages(int type, int free)
if (sis->flags & SWP_WRITEOK) {
n = sis->pages;
if (free)
- n -= sis->inuse_pages;
+ n -= swap_usage_in_pages(sis);
}
spin_unlock(&sis->lock);
}
@@ -2124,7 +2210,7 @@ static int try_to_unuse(unsigned int type)
swp_entry_t entry;
unsigned int i;
- if (!READ_ONCE(si->inuse_pages))
+ if (!swap_usage_in_pages(si))
goto success;
retry:
@@ -2137,7 +2223,7 @@ static int try_to_unuse(unsigned int type)
spin_lock(&mmlist_lock);
p = &init_mm.mmlist;
- while (READ_ONCE(si->inuse_pages) &&
+ while (swap_usage_in_pages(si) &&
!signal_pending(current) &&
(p = p->next) != &init_mm.mmlist) {
@@ -2165,7 +2251,7 @@ static int try_to_unuse(unsigned int type)
mmput(prev_mm);
i = 0;
- while (READ_ONCE(si->inuse_pages) &&
+ while (swap_usage_in_pages(si) &&
!signal_pending(current) &&
(i = find_next_to_unuse(si, i)) != 0) {
@@ -2200,7 +2286,7 @@ static int try_to_unuse(unsigned int type)
* folio_alloc_swap(), temporarily hiding that swap. It's easy
* and robust (though cpu-intensive) just to keep retrying.
*/
- if (READ_ONCE(si->inuse_pages)) {
+ if (swap_usage_in_pages(si)) {
if (!signal_pending(current))
goto retry;
return -EINTR;
@@ -2209,7 +2295,7 @@ static int try_to_unuse(unsigned int type)
success:
/*
* Make sure that further cleanups after try_to_unuse() returns happen
- * after swap_range_free() reduces si->inuse_pages to 0.
+ * after swap_range_free() reduces inuse_pages to 0.
*/
smp_mb();
return 0;
@@ -2227,7 +2313,7 @@ static void drain_mmlist(void)
unsigned int type;
for (type = 0; type < nr_swapfiles; type++)
- if (swap_info[type]->inuse_pages)
+ if (swap_usage_in_pages(swap_info[type]))
return;
spin_lock(&mmlist_lock);
list_for_each_safe(p, next, &init_mm.mmlist)
@@ -2406,7 +2492,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio,
static void _enable_swap_info(struct swap_info_struct *si)
{
- si->flags |= SWP_WRITEOK;
atomic_long_add(si->pages, &nr_swap_pages);
total_swap_pages += si->pages;
@@ -2423,9 +2508,8 @@ static void _enable_swap_info(struct swap_info_struct *si)
*/
plist_add(&si->list, &swap_active_head);
- /* add to available list if swap device is not full */
- if (si->inuse_pages < si->pages)
- add_to_avail_list(si);
+ /* Add the device to the available lists; this also sets SWP_WRITEOK */
+ add_to_avail_list(si, true);
}
static void enable_swap_info(struct swap_info_struct *si, int prio,
@@ -2523,7 +2607,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
goto out_dput;
}
spin_lock(&p->lock);
- del_from_avail_list(p);
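+ /*
+ * Removing the device also clears SWP_WRITEOK and sets the
+ * off-list bit, under both p->lock and swap_avail_lock.
+ */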
+ del_from_avail_list(p, true);
if (p->prio < 0) {
struct swap_info_struct *si = p;
int nid;
@@ -2541,7 +2625,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
plist_del(&p->list, &swap_active_head);
atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
- p->flags &= ~SWP_WRITEOK;
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
@@ -2721,7 +2804,7 @@ static int swap_show(struct seq_file *swap, void *v)
}
bytes = K(si->pages);
- inuse = K(READ_ONCE(si->inuse_pages));
+ inuse = K(swap_usage_in_pages(si));
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");
@@ -2838,6 +2921,7 @@ static struct swap_info_struct *alloc_swap_info(void)
}
spin_lock_init(&p->lock);
spin_lock_init(&p->cont_lock);
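+ /*
+ * Start with the off-list bit set; add_to_avail_list() clears it
+ * when the device is enabled by swapon.
+ */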
+ atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
init_completion(&p->comp);
return p;
@@ -3335,7 +3419,7 @@ void si_swapinfo(struct sysinfo *val)
struct swap_info_struct *si = swap_info[type];
if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
- nr_to_be_unused += READ_ONCE(si->inuse_pages);
+ nr_to_be_unused += swap_usage_in_pages(si);
}
val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
val->totalswap = total_swap_pages + nr_to_be_unused;