@@ -284,12 +284,10 @@ enum swap_cluster_flags {
#endif
/*
- * We assign a cluster to each CPU, so each CPU can allocate swap entry from
- * its own cluster and swapout sequentially. The purpose is to optimize swapout
- * throughput.
+ * We keep using the same cluster for rotational devices so IO will be
+ * sequential. The purpose is to optimize swap throughput on these devices.
*/
-struct percpu_cluster {
- local_lock_t lock; /* Protect the percpu_cluster above */
+struct swap_sequential_cluster {
unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
};
@@ -315,8 +313,7 @@ struct swap_info_struct {
atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS];
unsigned int pages; /* total of usable pages of swap */
atomic_long_t inuse_pages; /* number of those currently in use */
- struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
- struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */
+ struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */
spinlock_t global_cluster_lock; /* Serialize usage of global cluster */
struct rb_root swap_extent_root;/* root of the swap extent rbtree */
struct block_device *bdev; /* swap device or bdev of swap file */
@@ -116,6 +116,18 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
atomic_t nr_rotate_swap = ATOMIC_INIT(0);
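+/*
+ * Per-CPU swap cluster cache shared by all swap devices: for each allocation
+ * order it remembers which device (si) and which cluster offset the last
+ * allocation on this CPU used, so the next allocation can continue from there.
+ */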
+struct percpu_swap_cluster {
+ struct swap_info_struct *si[SWAP_NR_ORDERS];
+ unsigned long offset[SWAP_NR_ORDERS];
+ local_lock_t lock;
+};
+
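+/*
+ * Nothing is cached initially, so swap_alloc_fast() falls through to the
+ * slow path until a CPU has allocated at least once.
+ */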
+static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
+ .si = { NULL },
+ .offset = { SWAP_ENTRY_INVALID },
+ .lock = INIT_LOCAL_LOCK(),
+};
+
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
if (type >= MAX_SWAPFILES)
@@ -539,7 +551,7 @@ static bool swap_do_scheduled_discard(struct swap_info_struct *si)
ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
/*
* Delete the cluster from list to prepare for discard, but keep
- * the CLUSTER_FLAG_DISCARD flag, there could be percpu_cluster
+ * the CLUSTER_FLAG_DISCARD flag, as percpu_swap_cluster could be
* pointing to it, or ran into by relocate_cluster.
*/
list_del(&ci->list);
@@ -805,10 +817,12 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
out:
relocate_cluster(si, ci);
unlock_cluster(ci);
- if (si->flags & SWP_SOLIDSTATE)
- __this_cpu_write(si->percpu_cluster->next[order], next);
- else
+ if (si->flags & SWP_SOLIDSTATE) {
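+ /* Cache this device and the next offset as this CPU's allocation hint */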
+ __this_cpu_write(percpu_swap_cluster.si[order], si);
+ __this_cpu_write(percpu_swap_cluster.offset[order], next);
+ } else {
si->global_cluster->next[order] = next;
+ }
return found;
}
@@ -862,9 +876,8 @@ static void swap_reclaim_work(struct work_struct *work)
}
/*
- * Try to get swap entries with specified order from current cpu's swap entry
- * pool (a cluster). This might involve allocating a new cluster for current CPU
- * too.
+ * Try to allocate swap entries with the specified order, and try to set a
+ * new cluster for the current CPU too.
*/
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
unsigned char usage)
@@ -872,18 +885,12 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
struct swap_cluster_info *ci;
unsigned int offset, found = 0;
- if (si->flags & SWP_SOLIDSTATE) {
- /* Fast path using per CPU cluster */
- local_lock(&si->percpu_cluster->lock);
- offset = __this_cpu_read(si->percpu_cluster->next[order]);
- } else {
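+ /*
+ * The SSD per-CPU fast path now lives in swap_alloc_fast(); only HDD
+ * allocation needs the per-device serialization below.
+ */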
+ if (!(si->flags & SWP_SOLIDSTATE)) {
/* Serialize HDD SWAP allocation for each device. */
spin_lock(&si->global_cluster_lock);
offset = si->global_cluster->next[order];
- }
-
- if (offset) {
ci = lock_cluster(si, offset);
+
/* Cluster could have been used by another order */
if (cluster_is_usable(ci, order)) {
if (cluster_is_empty(ci))
@@ -973,9 +980,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
}
}
done:
- if (si->flags & SWP_SOLIDSTATE)
- local_unlock(&si->percpu_cluster->lock);
- else
+ if (!(si->flags & SWP_SOLIDSTATE))
spin_unlock(&si->global_cluster_lock);
return found;
}
@@ -1196,6 +1201,49 @@ static bool get_swap_device_info(struct swap_info_struct *si)
return true;
}
+/*
+ * Fast path: try to get swap entries with the specified order from the
+ * current CPU's swap entry pool (a cluster).
+ */
+static int swap_alloc_fast(swp_entry_t entries[],
+ unsigned char usage,
+ int order, int n_goal)
+{
+ struct swap_cluster_info *ci;
+ struct swap_info_struct *si;
+ unsigned int offset, found;
+ int n_ret = 0;
+
+ n_goal = min(n_goal, SWAP_BATCH);
+
+ /*
+ * Once allocated, swap_info_struct will never be completely freed,
+ * so checking its liveness with get_swap_device_info is enough.
+ */
+ si = __this_cpu_read(percpu_swap_cluster.si[order]);
+ offset = __this_cpu_read(percpu_swap_cluster.offset[order]);
+ if (!si || !offset || !get_swap_device_info(si))
+ return 0;
+
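+ /* Scan the cached cluster while the percpu hint stays usable */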
+ while (offset) {
+ ci = lock_cluster(si, offset);
+ if (!cluster_is_usable(ci, order))
+ break;
+ if (cluster_is_empty(ci))
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset, order, usage);
+ if (!found)
+ break;
+ entries[n_ret++] = swp_entry(si->type, found);
+ if (n_ret == n_goal)
+ break;
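+ /* alloc_swap_scan_cluster() updated the percpu offset hint, continue from it */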
+ offset = __this_cpu_read(percpu_swap_cluster.offset[order]);
+ }
+
+ put_swap_device(si);
+ return n_ret;
+}
+
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
{
int order = swap_entry_order(entry_order);
@@ -1204,19 +1252,36 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
int n_ret = 0;
int node;
+ /* Fast path using percpu cluster */
+ local_lock(&percpu_swap_cluster.lock);
+ n_ret = swap_alloc_fast(swp_entries,
+ SWAP_HAS_CACHE,
+ order, n_goal);
+ if (n_ret == n_goal)
+ goto out;
+
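+ /* The fast path is capped at SWAP_BATCH; cap the remainder the same way */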
+ n_goal = min_t(int, n_goal - n_ret, SWAP_BATCH);
+ /* Rotate the device and switch to a new cluster */
spin_lock(&swap_avail_lock);
start_over:
node = numa_node_id();
plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
- /* requeue si to after same-priority siblings */
plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
if (get_swap_device_info(si)) {
- n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
- n_goal, swp_entries, order);
+ /*
+ * For order 0 allocation, try our best to fill the
+ * request, as it's used by the slot cache.
+ *
+ * For mTHP allocation, n_goal is always 1, and a failed
+ * mTHP swapin just makes the caller fall back to
+ * order 0 allocation, so just bail out.
+ */
+ n_ret += scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal,
+ swp_entries + n_ret, order);
put_swap_device(si);
if (n_ret || size > 1)
- goto check_out;
+ goto out;
}
spin_lock(&swap_avail_lock);
@@ -1234,12 +1299,10 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
if (plist_node_empty(&next->avail_lists[node]))
goto start_over;
}
-
spin_unlock(&swap_avail_lock);
-
-check_out:
+out:
+ local_unlock(&percpu_swap_cluster.lock);
atomic_long_sub(n_ret * size, &nr_swap_pages);
-
return n_ret;
}
@@ -2725,8 +2788,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
arch_swap_invalidate_area(p->type);
zswap_swapoff(p->type);
mutex_unlock(&swapon_mutex);
- free_percpu(p->percpu_cluster);
- p->percpu_cluster = NULL;
kfree(p->global_cluster);
p->global_cluster = NULL;
vfree(swap_map);
@@ -3125,7 +3186,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
struct swap_cluster_info *cluster_info;
unsigned long i, j, idx;
- int cpu, err = -ENOMEM;
+ int err = -ENOMEM;
cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
if (!cluster_info)
@@ -3134,20 +3195,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < nr_clusters; i++)
spin_lock_init(&cluster_info[i].lock);
- if (si->flags & SWP_SOLIDSTATE) {
- si->percpu_cluster = alloc_percpu(struct percpu_cluster);
- if (!si->percpu_cluster)
- goto err_free;
-
- for_each_possible_cpu(cpu) {
- struct percpu_cluster *cluster;
-
- cluster = per_cpu_ptr(si->percpu_cluster, cpu);
- for (i = 0; i < SWAP_NR_ORDERS; i++)
- cluster->next[i] = SWAP_ENTRY_INVALID;
- local_lock_init(&cluster->lock);
- }
- } else {
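+ /*
+ * SSDs use the global percpu_swap_cluster as their allocation cache;
+ * only rotational devices need the per-device sequential cluster.
+ */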
+ if (!(si->flags & SWP_SOLIDSTATE)) {
si->global_cluster = kmalloc(sizeof(*si->global_cluster),
GFP_KERNEL);
if (!si->global_cluster)
@@ -3424,8 +3472,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
- free_percpu(si->percpu_cluster);
- si->percpu_cluster = NULL;
kfree(si->global_cluster);
si->global_cluster = NULL;
inode = NULL;