diff mbox series

[1/7] zram: switch to non-atomic entry locking

Message ID 20250122055831.3341175-2-senozhatsky@chromium.org (mailing list archive)
State New
Headers show
Series zram: preemptible writes and occasionally preemptible reads | expand

Commit Message

Sergey Senozhatsky Jan. 22, 2025, 5:57 a.m. UTC
Concurrent modifications of meta table entries is now handled
by per-entry spin-lock.  This has a number of shortcomings.

First, this imposes atomic requirements on compression backends.
zram can call both zcomp_compress() and zcomp_decompress() under
entry spin-lock, which implies that we can use only compression
algorithms that don't schedule/sleep/wait during compression and
decompression.  This, for instance, makes it impossible to use
some of the ASYNC compression algorithms (H/W compression, etc.)
implementations.

Second, this can potentially trigger watchdogs.  For example,
entry re-compression with secondary algorithms is performed
under entry spin-lock.  Given that we chain secondary
compression algorithms and that some of them can be configured
for best compression ratio (and worst compression speed) zram
can stay under spin-lock for quite some time.

Do not use per-entry spin-locks and instead switch to a bucket
RW-sem locking scheme.  Each rw-lock controls access to a number
of zram entries (a bucket).  Each bucket is configured to protect
12 (for the time being) pages of zram disk, in order to minimize
memory footprint of bucket locks - we cannot afford a mutex of
rwsem per-entry, that can use hundreds of megabytes on a relatively
common setup, yet we still want some degree of parallelism wrt
entries access.

Bucket lock is taken in write-mode only to modify zram entry,
compression and zsmalloc handle allocation performed outside of
bucket lock scopr; decompression is performed under bucket
read-lock.  At a glance there doesn't seem to be any immediate
performance difference:

time make -j$(nproc) ARCH=x86 all modules # defconfig, 24-vCPU VM

BEFORE
------
Kernel: arch/x86/boot/bzImage is ready  (#1)
1371.43user 158.84system 1:30.70elapsed 1687%CPU (0avgtext+0avgdata 825620maxresident)k
32504inputs+1768352outputs (259major+51536895minor)pagefaults 0swaps

AFTER
-----
Kernel: arch/x86/boot/bzImage is ready  (#1)
1366.79user 158.82system 1:31.17elapsed 1673%CPU (0avgtext+0avgdata 825676maxresident)k
32504inputs+1768352outputs (263major+51538123minor)pagefaults 0swaps

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 drivers/block/zram/zram_drv.c | 151 ++++++++++++++++++++--------------
 drivers/block/zram/zram_drv.h |   8 +-
 2 files changed, 98 insertions(+), 61 deletions(-)
diff mbox series

Patch

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9f5020b077c5..7eb7feba3cac 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -58,19 +58,44 @@  static void zram_free_page(struct zram *zram, size_t index);
 static int zram_read_from_zspool(struct zram *zram, struct page *page,
 				 u32 index);
 
-static int zram_slot_trylock(struct zram *zram, u32 index)
+static u32 zram_bucket_idx(u32 index)
 {
-	return spin_trylock(&zram->table[index].lock);
+	return index / ZRAM_PAGES_PER_BUCKET_LOCK;
 }
 
-static void zram_slot_lock(struct zram *zram, u32 index)
+static int zram_slot_write_trylock(struct zram *zram, u32 index)
 {
-	spin_lock(&zram->table[index].lock);
+	u32 idx = zram_bucket_idx(index);
+
+	return down_write_trylock(&zram->locks[idx].lock);
+}
+
+static void zram_slot_write_lock(struct zram *zram, u32 index)
+{
+	u32 idx = zram_bucket_idx(index);
+
+	down_write(&zram->locks[idx].lock);
+}
+
+static void zram_slot_write_unlock(struct zram *zram, u32 index)
+{
+	u32 idx = zram_bucket_idx(index);
+
+	up_write(&zram->locks[idx].lock);
+}
+
+static void zram_slot_read_lock(struct zram *zram, u32 index)
+{
+	u32 idx = zram_bucket_idx(index);
+
+	down_read(&zram->locks[idx].lock);
 }
 
-static void zram_slot_unlock(struct zram *zram, u32 index)
+static void zram_slot_read_unlock(struct zram *zram, u32 index)
 {
-	spin_unlock(&zram->table[index].lock);
+	u32 idx = zram_bucket_idx(index);
+
+	up_read(&zram->locks[idx].lock);
 }
 
 static inline bool init_done(struct zram *zram)
@@ -93,7 +118,6 @@  static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
 	zram->table[index].handle = handle;
 }
 
-/* flag operations require table entry bit_spin_lock() being held */
 static bool zram_test_flag(struct zram *zram, u32 index,
 			enum zram_pageflags flag)
 {
@@ -229,9 +253,9 @@  static void release_pp_slot(struct zram *zram, struct zram_pp_slot *pps)
 {
 	list_del_init(&pps->entry);
 
-	zram_slot_lock(zram, pps->index);
+	zram_slot_write_lock(zram, pps->index);
 	zram_clear_flag(zram, pps->index, ZRAM_PP_SLOT);
-	zram_slot_unlock(zram, pps->index);
+	zram_slot_write_unlock(zram, pps->index);
 
 	kfree(pps);
 }
@@ -394,11 +418,11 @@  static void mark_idle(struct zram *zram, ktime_t cutoff)
 		 *
 		 * And ZRAM_WB slots simply cannot be ZRAM_IDLE.
 		 */
-		zram_slot_lock(zram, index);
+		zram_slot_write_lock(zram, index);
 		if (!zram_allocated(zram, index) ||
 		    zram_test_flag(zram, index, ZRAM_WB) ||
 		    zram_test_flag(zram, index, ZRAM_SAME)) {
-			zram_slot_unlock(zram, index);
+			zram_slot_write_unlock(zram, index);
 			continue;
 		}
 
@@ -410,7 +434,7 @@  static void mark_idle(struct zram *zram, ktime_t cutoff)
 			zram_set_flag(zram, index, ZRAM_IDLE);
 		else
 			zram_clear_flag(zram, index, ZRAM_IDLE);
-		zram_slot_unlock(zram, index);
+		zram_slot_write_unlock(zram, index);
 	}
 }
 
@@ -709,7 +733,7 @@  static int scan_slots_for_writeback(struct zram *zram, u32 mode,
 
 		INIT_LIST_HEAD(&pps->entry);
 
-		zram_slot_lock(zram, index);
+		zram_slot_write_lock(zram, index);
 		if (!zram_allocated(zram, index))
 			goto next;
 
@@ -731,7 +755,7 @@  static int scan_slots_for_writeback(struct zram *zram, u32 mode,
 		place_pp_slot(zram, ctl, pps);
 		pps = NULL;
 next:
-		zram_slot_unlock(zram, index);
+		zram_slot_write_unlock(zram, index);
 	}
 
 	kfree(pps);
@@ -822,7 +846,7 @@  static ssize_t writeback_store(struct device *dev,
 		}
 
 		index = pps->index;
-		zram_slot_lock(zram, index);
+		zram_slot_read_lock(zram, index);
 		/*
 		 * scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so
 		 * slots can change in the meantime. If slots are accessed or
@@ -833,7 +857,7 @@  static ssize_t writeback_store(struct device *dev,
 			goto next;
 		if (zram_read_from_zspool(zram, page, index))
 			goto next;
-		zram_slot_unlock(zram, index);
+		zram_slot_read_unlock(zram, index);
 
 		bio_init(&bio, zram->bdev, &bio_vec, 1,
 			 REQ_OP_WRITE | REQ_SYNC);
@@ -860,7 +884,7 @@  static ssize_t writeback_store(struct device *dev,
 		}
 
 		atomic64_inc(&zram->stats.bd_writes);
-		zram_slot_lock(zram, index);
+		zram_slot_write_lock(zram, index);
 		/*
 		 * Same as above, we release slot lock during writeback so
 		 * slot can change under us: slot_free() or slot_free() and
@@ -882,7 +906,7 @@  static ssize_t writeback_store(struct device *dev,
 			zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
 		spin_unlock(&zram->wb_limit_lock);
 next:
-		zram_slot_unlock(zram, index);
+		zram_slot_write_unlock(zram, index);
 		release_pp_slot(zram, pps);
 
 		cond_resched();
@@ -1001,7 +1025,7 @@  static ssize_t read_block_state(struct file *file, char __user *buf,
 	for (index = *ppos; index < nr_pages; index++) {
 		int copied;
 
-		zram_slot_lock(zram, index);
+		zram_slot_read_lock(zram, index);
 		if (!zram_allocated(zram, index))
 			goto next;
 
@@ -1019,13 +1043,13 @@  static ssize_t read_block_state(struct file *file, char __user *buf,
 				       ZRAM_INCOMPRESSIBLE) ? 'n' : '.');
 
 		if (count <= copied) {
-			zram_slot_unlock(zram, index);
+			zram_slot_read_unlock(zram, index);
 			break;
 		}
 		written += copied;
 		count -= copied;
 next:
-		zram_slot_unlock(zram, index);
+		zram_slot_read_unlock(zram, index);
 		*ppos += 1;
 	}
 
@@ -1451,37 +1475,44 @@  static void zram_meta_free(struct zram *zram, u64 disksize)
 	zs_destroy_pool(zram->mem_pool);
 	vfree(zram->table);
 	zram->table = NULL;
+	vfree(zram->locks);
+	zram->locks = NULL;
 }
 
 static bool zram_meta_alloc(struct zram *zram, u64 disksize)
 {
-	size_t num_pages, index;
+	size_t num_ents, index;
 
-	num_pages = disksize >> PAGE_SHIFT;
-	zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table)));
+	num_ents = disksize >> PAGE_SHIFT;
+	zram->table = vzalloc(array_size(num_ents, sizeof(*zram->table)));
 	if (!zram->table)
-		return false;
+		goto error;
+
+	num_ents /= ZRAM_PAGES_PER_BUCKET_LOCK;
+	zram->locks = vzalloc(array_size(num_ents, sizeof(*zram->locks)));
+	if (!zram->locks)
+		goto error;
 
 	zram->mem_pool = zs_create_pool(zram->disk->disk_name);
-	if (!zram->mem_pool) {
-		vfree(zram->table);
-		zram->table = NULL;
-		return false;
-	}
+	if (!zram->mem_pool)
+		goto error;
 
 	if (!huge_class_size)
 		huge_class_size = zs_huge_class_size(zram->mem_pool);
 
-	for (index = 0; index < num_pages; index++)
-		spin_lock_init(&zram->table[index].lock);
+	for (index = 0; index < num_ents; index++)
+		init_rwsem(&zram->locks[index].lock);
+
 	return true;
+
+error:
+	vfree(zram->table);
+	zram->table = NULL;
+	vfree(zram->locks);
+	zram->locks = NULL;
+	return false;
 }
 
-/*
- * To protect concurrent access to the same index entry,
- * caller should hold this table index entry's bit_spinlock to
- * indicate this index entry is accessing.
- */
 static void zram_free_page(struct zram *zram, size_t index)
 {
 	unsigned long handle;
@@ -1602,17 +1633,17 @@  static int zram_read_page(struct zram *zram, struct page *page, u32 index,
 {
 	int ret;
 
-	zram_slot_lock(zram, index);
+	zram_slot_read_lock(zram, index);
 	if (!zram_test_flag(zram, index, ZRAM_WB)) {
 		/* Slot should be locked through out the function call */
 		ret = zram_read_from_zspool(zram, page, index);
-		zram_slot_unlock(zram, index);
+		zram_slot_read_unlock(zram, index);
 	} else {
 		/*
 		 * The slot should be unlocked before reading from the backing
 		 * device.
 		 */
-		zram_slot_unlock(zram, index);
+		zram_slot_read_unlock(zram, index);
 
 		ret = read_from_bdev(zram, page, zram_get_handle(zram, index),
 				     parent);
@@ -1655,10 +1686,10 @@  static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
 static int write_same_filled_page(struct zram *zram, unsigned long fill,
 				  u32 index)
 {
-	zram_slot_lock(zram, index);
+	zram_slot_write_lock(zram, index);
 	zram_set_flag(zram, index, ZRAM_SAME);
 	zram_set_handle(zram, index, fill);
-	zram_slot_unlock(zram, index);
+	zram_slot_write_unlock(zram, index);
 
 	atomic64_inc(&zram->stats.same_pages);
 	atomic64_inc(&zram->stats.pages_stored);
@@ -1693,11 +1724,11 @@  static int write_incompressible_page(struct zram *zram, struct page *page,
 	kunmap_local(src);
 	zs_unmap_object(zram->mem_pool, handle);
 
-	zram_slot_lock(zram, index);
+	zram_slot_write_lock(zram, index);
 	zram_set_flag(zram, index, ZRAM_HUGE);
 	zram_set_handle(zram, index, handle);
 	zram_set_obj_size(zram, index, PAGE_SIZE);
-	zram_slot_unlock(zram, index);
+	zram_slot_write_unlock(zram, index);
 
 	atomic64_add(PAGE_SIZE, &zram->stats.compr_data_size);
 	atomic64_inc(&zram->stats.huge_pages);
@@ -1718,9 +1749,9 @@  static int zram_write_page(struct zram *zram, struct page *page, u32 index)
 	bool same_filled;
 
 	/* First, free memory allocated to this slot (if any) */
-	zram_slot_lock(zram, index);
+	zram_slot_write_lock(zram, index);
 	zram_free_page(zram, index);
-	zram_slot_unlock(zram, index);
+	zram_slot_write_unlock(zram, index);
 
 	mem = kmap_local_page(page);
 	same_filled = page_same_filled(mem, &element);
@@ -1790,10 +1821,10 @@  static int zram_write_page(struct zram *zram, struct page *page, u32 index)
 	zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
 	zs_unmap_object(zram->mem_pool, handle);
 
-	zram_slot_lock(zram, index);
+	zram_slot_write_lock(zram, index);
 	zram_set_handle(zram, index, handle);
 	zram_set_obj_size(zram, index, comp_len);
-	zram_slot_unlock(zram, index);
+	zram_slot_write_unlock(zram, index);
 
 	/* Update stats */
 	atomic64_inc(&zram->stats.pages_stored);
@@ -1850,7 +1881,7 @@  static int scan_slots_for_recompress(struct zram *zram, u32 mode,
 
 		INIT_LIST_HEAD(&pps->entry);
 
-		zram_slot_lock(zram, index);
+		zram_slot_write_lock(zram, index);
 		if (!zram_allocated(zram, index))
 			goto next;
 
@@ -1871,7 +1902,7 @@  static int scan_slots_for_recompress(struct zram *zram, u32 mode,
 		place_pp_slot(zram, ctl, pps);
 		pps = NULL;
 next:
-		zram_slot_unlock(zram, index);
+		zram_slot_write_unlock(zram, index);
 	}
 
 	kfree(pps);
@@ -2162,7 +2193,7 @@  static ssize_t recompress_store(struct device *dev,
 		if (!num_recomp_pages)
 			break;
 
-		zram_slot_lock(zram, pps->index);
+		zram_slot_write_lock(zram, pps->index);
 		if (!zram_test_flag(zram, pps->index, ZRAM_PP_SLOT))
 			goto next;
 
@@ -2170,7 +2201,7 @@  static ssize_t recompress_store(struct device *dev,
 				      &num_recomp_pages, threshold,
 				      prio, prio_max);
 next:
-		zram_slot_unlock(zram, pps->index);
+		zram_slot_write_unlock(zram, pps->index);
 		release_pp_slot(zram, pps);
 
 		if (err) {
@@ -2217,9 +2248,9 @@  static void zram_bio_discard(struct zram *zram, struct bio *bio)
 	}
 
 	while (n >= PAGE_SIZE) {
-		zram_slot_lock(zram, index);
+		zram_slot_write_lock(zram, index);
 		zram_free_page(zram, index);
-		zram_slot_unlock(zram, index);
+		zram_slot_write_unlock(zram, index);
 		atomic64_inc(&zram->stats.notify_free);
 		index++;
 		n -= PAGE_SIZE;
@@ -2248,9 +2279,9 @@  static void zram_bio_read(struct zram *zram, struct bio *bio)
 		}
 		flush_dcache_page(bv.bv_page);
 
-		zram_slot_lock(zram, index);
+		zram_slot_write_lock(zram, index);
 		zram_accessed(zram, index);
-		zram_slot_unlock(zram, index);
+		zram_slot_write_unlock(zram, index);
 
 		bio_advance_iter_single(bio, &iter, bv.bv_len);
 	} while (iter.bi_size);
@@ -2278,9 +2309,9 @@  static void zram_bio_write(struct zram *zram, struct bio *bio)
 			break;
 		}
 
-		zram_slot_lock(zram, index);
+		zram_slot_write_lock(zram, index);
 		zram_accessed(zram, index);
-		zram_slot_unlock(zram, index);
+		zram_slot_write_unlock(zram, index);
 
 		bio_advance_iter_single(bio, &iter, bv.bv_len);
 	} while (iter.bi_size);
@@ -2321,13 +2352,13 @@  static void zram_slot_free_notify(struct block_device *bdev,
 	zram = bdev->bd_disk->private_data;
 
 	atomic64_inc(&zram->stats.notify_free);
-	if (!zram_slot_trylock(zram, index)) {
+	if (!zram_slot_write_trylock(zram, index)) {
 		atomic64_inc(&zram->stats.miss_free);
 		return;
 	}
 
 	zram_free_page(zram, index);
-	zram_slot_unlock(zram, index);
+	zram_slot_write_unlock(zram, index);
 }
 
 static void zram_comp_params_reset(struct zram *zram)
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index db78d7c01b9a..b272ede404b0 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -28,6 +28,7 @@ 
 #define ZRAM_SECTOR_PER_LOGICAL_BLOCK	\
 	(1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT))
 
+#define ZRAM_PAGES_PER_BUCKET_LOCK	12
 
 /*
  * ZRAM is mainly used for memory efficiency so we want to keep memory
@@ -63,13 +64,17 @@  enum zram_pageflags {
 /* Allocated for each disk page */
 struct zram_table_entry {
 	unsigned long handle;
+	/* u32 suffice for flags + u32 padding */
 	unsigned int flags;
-	spinlock_t lock;
 #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
 	ktime_t ac_time;
 #endif
 };
 
+struct zram_bucket_lock {
+	struct rw_semaphore lock;
+};
+
 struct zram_stats {
 	atomic64_t compr_data_size;	/* compressed size of pages stored */
 	atomic64_t failed_reads;	/* can happen when memory is too low */
@@ -101,6 +106,7 @@  struct zram_stats {
 
 struct zram {
 	struct zram_table_entry *table;
+	struct zram_bucket_lock *locks;
 	struct zs_pool *mem_pool;
 	struct zcomp *comps[ZRAM_MAX_COMPS];
 	struct zcomp_params params[ZRAM_MAX_COMPS];