
[v2] zsmalloc: Fix address alignment in zspage for performance improvement

Message ID 20250129082825.3760555-1-chao.shun.cheng.tw@gmail.com (mailing list archive)
State New
Series [v2] zsmalloc: Fix address alignment in zspage for performance improvement

Commit Message

Kenny Cheng Jan. 29, 2025, 8:28 a.m. UTC
A zspage consists of multiple objects, each containing a "link struct"
that connects it to the next object. The "link struct" is placed at the
beginning of each object. On a 32-bit system the "link struct" is
4 bytes, so the address returned by `zs_map_object` is only guaranteed
to be 4-byte aligned.
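
For reference, the "link struct" is struct link_free, a single machine
word (paraphrased from mm/zsmalloc.c, comments abridged):

struct link_free {
	union {
		/* free object index, valid while the object is free */
		unsigned long next;
		/* handle of the object, valid once it is allocated */
		unsigned long handle;
	};
};

On 32-bit, sizeof(unsigned long) == 4, hence the 4-byte alignment.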

For better performance, zram compression/decompression can be offloaded
to hardware designed by the IC vendor. For example, Realtek's hardware
requires 16-byte alignment. With only 4-byte alignment, however, a
`memcpy` is needed to move the data from the 4-byte-aligned address to
a 16-byte-aligned buffer, which negatively impacts zram performance.
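
A minimal sketch of the copy such a driver is forced to do today
(hw_decompress(), rtk_hw_decompress() and the bounce buffer are
hypothetical names for illustration, not a real API):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define HW_ALIGN 16	/* alignment required by the engine */

/* provided by the (hypothetical) vendor driver */
int rtk_hw_decompress(const void *src, size_t slen, void *dst, size_t dlen);

static char bounce[4096] __attribute__((aligned(HW_ALIGN)));

int hw_decompress(const void *src, size_t slen, void *dst, size_t dlen)
{
	if ((uintptr_t)src % HW_ALIGN) {
		/*
		 * zs_map_object() only guarantees 4-byte alignment, so
		 * bounce the data through an aligned buffer first; this
		 * is the memcpy the patch eliminates.
		 */
		memcpy(bounce, src, slen);
		src = bounce;
	}
	return rtk_hw_decompress(src, slen, dst, dlen);
}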

This patch instead places the "link struct" at the tail of the object,
which ensures that the address returned by `zs_map_object` is aligned
to the class size and DELTA, improving zram performance by eliminating
the unnecessary memory copy.
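
Schematically, for one object of class->size bytes (a sketch, not taken
from the patch itself):

before:  [ link/handle | payload ...................... ]
         zs_map_object() returns base + ZS_HANDLE_SIZE
         (only 4-byte aligned)

after:   [ payload ...................... | link/handle ]
         zs_map_object() returns base
         (aligned to the class size)

This is why the patch below drops the `ret += ZS_HANDLE_SIZE`
adjustment in zs_map_object().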

Signed-off-by: Kenny Cheng <chao.shun.cheng.tw@gmail.com>
---
V1 -> V2: Fixed compiler errors

 mm/zsmalloc.c | 69 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 53 insertions(+), 16 deletions(-)

Comments

Sergey Senozhatsky Jan. 29, 2025, 11:45 a.m. UTC | #1
Cc-ing Andrew and Yosry

On (25/01/29 16:28), Kenny Cheng wrote:
> A zspage consists of multiple objects, each containing a "link struct"
> that connects it to the next object. The "link struct" is placed at the
> beginning of each object. On a 32-bit system the "link struct" is
> 4 bytes, so the address returned by `zs_map_object` is only guaranteed
> to be 4-byte aligned.

Sorry, you picked a rather unlucky time to patch zsmalloc, as it
goes through some intrusive changes.  We need to stabilize those
changes first.

> For better performance, zram compression/decompression can be offloaded
> to hardware designed by the IC vendor. For example, Realtek's hardware
> requires 16-byte alignment. With only 4-byte alignment, however, a
> `memcpy` is needed to move the data from the 4-byte-aligned address to
> a 16-byte-aligned buffer, which negatively impacts zram performance.

Hmm, I don't know.  If we change zsmalloc to make some H/W happy, how
do we make sure that something that is good for Realtek is not "bad"
for some other H/W?

I'm very unsure about having "vendor-specific" (by the way, is that
out-of-tree compression/decompression driver?) changes in zsmalloc.

Kenny Cheng Jan. 29, 2025, 1:17 p.m. UTC | #2
> Hmm, I don't know.  If we change zsmalloc to make some H/W happy, how
> do we make sure that something that is good for Realtek is not "bad"
> for some other H/W?

No, this patch would not have any impact on other HW. It only changes
the position of the "link struct" from head to tail within an object.

Example:
Consider a 32-bit system with 4K pages, using a zspage whose class size
is 32 bytes, and look at the address obtained from zs_map_object when
objects are allocated from this zspage.

Without this patch: the address is aligned to 4 bytes; the alignment is
always the size of the "link struct".
With this patch: the address is aligned to 32 bytes; the alignment can
be controlled via the class size.

For SW zram algorithms, this patch has no effect. For HW zram
algorithms that only require 4-byte alignment, it has no effect either.

However, for a HW zram algorithm that requires more than 4-byte
alignment, this patch can improve zram performance when the class size
is a multiple of the required alignment; a small sketch of the
arithmetic follows.
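
A user-space sketch of that arithmetic (ZS_HANDLE_SIZE assumed to be
4 bytes, as on a 32-bit system, with a 16-byte HW requirement):

#include <stdio.h>

#define ZS_HANDLE_SIZE	4	/* sizeof(unsigned long) on 32-bit */
#define CLASS_SIZE	32	/* the class size from the example */
#define HW_ALIGN	16	/* alignment the HW engine wants */

int main(void)
{
	for (unsigned int obj = 0; obj < 4; obj++) {
		/* head placement: the mapped address skips the handle */
		unsigned int before = obj * CLASS_SIZE + ZS_HANDLE_SIZE;
		/* tail placement: the mapped address is the object base */
		unsigned int after = obj * CLASS_SIZE;

		printf("obj %u: before %% %d = %u, after %% %d = %u\n",
		       obj, HW_ALIGN, before % HW_ALIGN,
		       HW_ALIGN, after % HW_ALIGN);
	}
	return 0;
}

Every "before" address is 4 mod 16, so a 16-byte engine always needs
the bounce copy; every "after" address is 0 mod 16.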

> I'm very unsure about having "vendor-specific" (by the way, is that
> out-of-tree compression/decompression driver?) changes in zsmalloc.

Yes, it is an out-of-tree driver. It implements common
compression/decompression algorithms such as lz4, zstd, and so on; the
only difference is that the implementation is backed by in-house
designed HW.

Patch

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 817626a351f8..b6436e109fd8 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -800,7 +800,7 @@  static unsigned long handle_to_obj(unsigned long handle)
 	return *(unsigned long *)handle;
 }
 
-static inline bool obj_allocated(struct zpdesc *zpdesc, void *obj,
+static inline bool obj_allocated(struct zpdesc *zpdesc, void *link,
 				 unsigned long *phandle)
 {
 	unsigned long handle;
@@ -810,7 +810,7 @@  static inline bool obj_allocated(struct zpdesc *zpdesc, void *obj,
 		VM_BUG_ON_PAGE(!is_first_zpdesc(zpdesc), zpdesc_page(zpdesc));
 		handle = zpdesc->handle;
 	} else
-		handle = *(unsigned long *)obj;
+		handle = *(unsigned long *)link;
 
 	if (!(handle & OBJ_ALLOCATED_TAG))
 		return false;
@@ -911,7 +911,12 @@  static void init_zspage(struct size_class *class, struct zspage *zspage)
 		struct link_free *link;
 		void *vaddr;
 
-		set_first_obj_offset(zpdesc, off);
+		if (is_first_zpdesc(zpdesc)) {
+			set_first_obj_offset(zpdesc, off);
+			off = class->size - ZS_HANDLE_SIZE;
+		} else {
+			set_first_obj_offset(zpdesc, off + ZS_HANDLE_SIZE);
+		}
 
 		vaddr = kmap_local_zpdesc(zpdesc);
 		link = (struct link_free *)vaddr + off / sizeof(*link);
@@ -1095,7 +1100,7 @@  static void __zs_unmap_object(struct mapping_area *area,
 	off += ZS_HANDLE_SIZE;
 
 	sizes[0] = PAGE_SIZE - off;
-	sizes[1] = size - sizes[0];
+	sizes[1] = size - sizes[0] - ZS_HANDLE_SIZE;
 
 	/* copy per-cpu buffer to object */
 	memcpy_to_page(zpdesc_page(zpdescs[0]), off, buf, sizes[0]);
@@ -1240,9 +1245,6 @@  void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 
 	ret = __zs_map_object(area, zpdescs, off, class->size);
 out:
-	if (likely(!ZsHugePage(zspage)))
-		ret += ZS_HANDLE_SIZE;
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(zs_map_object);
@@ -1303,28 +1305,35 @@  EXPORT_SYMBOL_GPL(zs_huge_class_size);
 static unsigned long obj_malloc(struct zs_pool *pool,
 				struct zspage *zspage, unsigned long handle)
 {
-	int i, nr_zpdesc, offset;
+	int i, nr_obj_zpdesc, nr_link_zpdesc;
 	unsigned long obj;
 	struct link_free *link;
 	struct size_class *class;
 
 	struct zpdesc *m_zpdesc;
-	unsigned long m_offset;
+	unsigned long obj_off;
+	unsigned long link_off;
 	void *vaddr;
 
 	class = pool->size_class[zspage->class];
 	obj = get_freeobj(zspage);
 
-	offset = obj * class->size;
-	nr_zpdesc = offset >> PAGE_SHIFT;
-	m_offset = offset_in_page(offset);
+	obj_off = obj * class->size;
+	link_off = obj_off + class->size - ZS_HANDLE_SIZE;
+	nr_obj_zpdesc = obj_off >> PAGE_SHIFT;
+	nr_link_zpdesc = link_off >> PAGE_SHIFT;
+	link_off = offset_in_page(link_off);
 	m_zpdesc = get_first_zpdesc(zspage);
 
-	for (i = 0; i < nr_zpdesc; i++)
+	for (i = 0; i < nr_obj_zpdesc; i++)
 		m_zpdesc = get_next_zpdesc(m_zpdesc);
 
-	vaddr = kmap_local_zpdesc(m_zpdesc);
-	link = (struct link_free *)vaddr + m_offset / sizeof(*link);
+	if (nr_obj_zpdesc == nr_link_zpdesc)
+		vaddr = kmap_local_zpdesc(m_zpdesc);
+	else
+		vaddr = kmap_local_zpdesc(get_next_zpdesc(m_zpdesc));
+
+	link = (struct link_free *)vaddr + link_off / sizeof(*link);
 	set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
 	if (likely(!ZsHugePage(zspage)))
 		/* record handle in the header of allocated chunk */
@@ -1422,6 +1431,11 @@  static void obj_free(int class_size, unsigned long obj)
 
 	obj_to_location(obj, &f_zpdesc, &f_objidx);
 	f_offset = offset_in_page(class_size * f_objidx);
+	f_offset += class_size - ZS_HANDLE_SIZE;
+	if (f_offset >= PAGE_SIZE) {
+		f_zpdesc = get_next_zpdesc(f_zpdesc);
+		f_offset = offset_in_page(f_offset);
+	}
 	zspage = get_zspage(f_zpdesc);
 
 	vaddr = kmap_local_zpdesc(f_zpdesc);
@@ -1556,6 +1570,12 @@  static unsigned long find_alloced_obj(struct size_class *class,
 	void *addr = kmap_local_zpdesc(zpdesc);
 
 	offset = get_first_obj_offset(zpdesc);
+
+	if (is_first_zpdesc(zpdesc))
+		offset += class->size - ZS_HANDLE_SIZE;
+	else
+		offset -= ZS_HANDLE_SIZE;
+
 	offset += class->size * index;
 
 	while (offset < PAGE_SIZE) {
@@ -1806,7 +1826,7 @@  static int zs_page_migrate(struct page *newpage, struct page *page,
 	/* the migrate_write_lock protects zpage access via zs_map_object */
 	migrate_write_lock(zspage);
 
-	offset = get_first_obj_offset(zpdesc);
+	offset = get_first_obj_offset(zpdesc) + class->size - ZS_HANDLE_SIZE;
 	s_addr = kmap_local_zpdesc(zpdesc);
 
 	/*
@@ -1828,6 +1848,23 @@  static int zs_page_migrate(struct page *newpage, struct page *page,
 	}
 	kunmap_local(s_addr);
 
+	/*
+	 * Don't forget to check if there is an obj's link on the next page.
+	 * If there is, it also needs to be handled.
+	 */
+	if (offset_in_page(addr) && get_next_zpdesc(zpdesc)) {
+		s_addr = kmap_local_zpdesc(get_next_zpdesc(zpdesc));
+		addr = s_addr + offset_in_page(addr);
+		if (obj_allocated(zpdesc, addr, &handle)) {
+
+			old_obj = handle_to_obj(handle);
+			obj_to_location(old_obj, &dummy, &obj_idx);
+			new_obj = (unsigned long)location_to_obj(newzpdesc, obj_idx);
+			record_obj(handle, new_obj);
+		}
+		kunmap_local(s_addr);
+	}
+
 	replace_sub_page(class, zspage, newzpdesc, zpdesc);
 	/*
 	 * Since we complete the data copy and set up new zspage structure,