
btrfs: zlib: refactor S390x HW acceleration buffer preparation

Message ID bee6ac5945df3db876a553dbf6d0dd546f145aa7.1737701962.git.wqu@suse.com (mailing list archive)
State New
Series btrfs: zlib: refactor S390x HW acceleration buffer preparation

Commit Message

Qu Wenruo Jan. 24, 2025, 6:59 a.m. UTC
Currently for s390x HW zlib compression, to get the best performance we
need a buffer size which is larger than a page.

This means we need to copy multiple pages into workspace->buf, then use
that buffer as zlib compression input.

Currently the copy is hardcoded to use page-sized folios, and all the
handling is buried deep inside a loop.

Refactor the code by:

- Introduce a dedicated helper to do the buffer copy
  The new helper will be called copy_data_into_buffer().

- Add extra ASSERT()s
  * Make sure we only go into the function for hardware acceleration
  * Make sure we still get page-sized folios

- Prepare for future larger folios
  This means we will rely on the folio size, rather than PAGE_SIZE, to
  do the copy (see the sketch below).

- Handle the folio mapping and unmapping inside the helper function
  For the s390x hardware acceleration case, the caller never utilizes the
  @data_in pointer, thus we can do the folio mapping/unmapping entirely
  inside the helper.
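
For illustration, here is a minimal sketch of the copy loop inside the
new helper (simplified from the diff below; error handling and the
hardware acceleration ASSERT() are omitted):

  u64 cur = filepos;

  while (cur < filepos + length) {
          struct folio *folio;
          void *data_in;
          unsigned int offset;
          unsigned long copy_length;

          /* Grab the folio covering @cur from the page cache. */
          btrfs_compress_filemap_get_folio(mapping, cur, &folio);

          /* Bound the copy by folio_size(), not PAGE_SIZE. */
          offset = offset_in_folio(folio, cur);
          copy_length = min(folio_size(folio) - offset,
                            filepos + length - cur);

          /* Map, copy into the contiguous workspace buffer, unmap. */
          data_in = kmap_local_folio(folio, offset);
          memcpy(workspace->buf + cur - filepos, data_in, copy_length);
          kunmap_local(data_in);
          cur += copy_length;
  }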

Cc: Mikhail Zaslonko <zaslonko@linux.ibm.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
Hi Mikhail,

Would you mind giving this patch a test on s390x? I do not have easy
access to such a system.

And I hope I won't need to bother you again when we enable larger folios
for btrfs in the near future.

Thanks,
Qu
---
 fs/btrfs/zlib.c | 82 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 54 insertions(+), 28 deletions(-)

Comments

Zaslonko Mikhail Feb. 4, 2025, 9:55 p.m. UTC | #1
Hello Qu,

Sorry for the long silence.
I've tested this patch on s390 using our hardware compression test suite. 
No issues discovered.

Here is my
Acked-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>
Tested-by: Mikhail Zaslonko <zaslonko@linux.ibm.com>


On 24.01.2025 07:59, Qu Wenruo wrote:
> Currently for s390x HW zlib compression, to get the best performance we
> need a buffer size which is larger than a page.
> 
> This means we need to copy multiple pages into workspace->buf, then use
> that buffer as zlib compression input.
> 
> Currently the copy is hardcoded to use page-sized folios, and all the
> handling is buried deep inside a loop.
> 
> Refactor the code by:
> 
> - Introduce a dedicated helper to do the buffer copy
>   The new helper will be called copy_data_into_buffer().
> 
> - Add extra ASSERT()s
>   * Make sure we only go into the function for hardware acceleration
>   * Make sure we still get page-sized folios
> 
> - Prepare for future larger folios
>   This means we will rely on the folio size, rather than PAGE_SIZE, to
>   do the copy.
> 
> - Handle the folio mapping and unmapping inside the helper function
>   For the s390x hardware acceleration case, the caller never utilizes the
>   @data_in pointer, thus we can do the folio mapping/unmapping entirely
>   inside the helper.
> 
> Cc: Mikhail Zaslonko <zaslonko@linux.ibm.com>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
> Hi Mikhail,
> 
> Would you mind giving this patch a test on s390x? I do not have easy
> access to such a system.
> 
> And I hope I won't need to bother you again when we enable larger folios
> for btrfs in the near future.
> 
> Thanks,
> Qu
> ---
>  fs/btrfs/zlib.c | 82 ++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 54 insertions(+), 28 deletions(-)
> 
> diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
> index c9e92c6941ec..7d81d9a0b3a0 100644
> --- a/fs/btrfs/zlib.c
> +++ b/fs/btrfs/zlib.c
> @@ -94,6 +94,47 @@ struct list_head *zlib_alloc_workspace(unsigned int level)
>  	return ERR_PTR(-ENOMEM);
>  }
>  
> +/*
> + * Helper for S390x with hardware zlib compression support.
> + *
> + * That hardware acceleration requires a buffer size larger than a single page
> + * to get ideal performance, thus we need to do the memory copy other than
> + * use the page cache directly as input buffer.
> + */
> +static int copy_data_into_buffer(struct address_space *mapping,
> +				 struct workspace *workspace, u64 filepos,
> +				 unsigned long length)
> +{
> +	u64 cur = filepos;
> +
> +	/* It's only for hardware accelerated zlib code. */
> +	ASSERT(zlib_deflate_dfltcc_enabled());
> +
> +	while (cur < filepos + length) {
> +		struct folio *folio;
> +		void *data_in;
> +		unsigned int offset;
> +		unsigned long copy_length;
> +		int ret;
> +
> +		ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio);
> +		if (ret < 0)
> +			return ret;
> +		/* No larger folio support yet. */
> +		ASSERT(!folio_test_large(folio));
> +
> +		offset = offset_in_folio(folio, cur);
> +		copy_length = min(folio_size(folio) - offset,
> +				  filepos + length - cur);
> +
> +		data_in = kmap_local_folio(folio, offset);
> +		memcpy(workspace->buf + cur - filepos, data_in, copy_length);
> +		kunmap_local(data_in);
> +		cur += copy_length;
> +	}
> +	return 0;
> +}
> +
>  int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
>  			 u64 start, struct folio **folios, unsigned long *out_folios,
>  			 unsigned long *total_in, unsigned long *total_out)
> @@ -105,8 +146,6 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
>  	int nr_folios = 0;
>  	struct folio *in_folio = NULL;
>  	struct folio *out_folio = NULL;
> -	unsigned long bytes_left;
> -	unsigned int in_buf_folios;
>  	unsigned long len = *total_out;
>  	unsigned long nr_dest_folios = *out_folios;
>  	const unsigned long max_out = nr_dest_folios * PAGE_SIZE;
> @@ -150,34 +189,21 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
>  		 * the workspace buffer if required.
>  		 */
>  		if (workspace->strm.avail_in == 0) {
> -			bytes_left = len - workspace->strm.total_in;
> -			in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
> -					    workspace->buf_size / PAGE_SIZE);
> -			if (in_buf_folios > 1) {
> -				int i;
> +			unsigned long bytes_left = len - workspace->strm.total_in;
> +			unsigned int copy_length = min(bytes_left, workspace->buf_size);
>  
> -				/* S390 hardware acceleration path, not subpage. */
> -				ASSERT(!btrfs_is_subpage(
> -						inode_to_fs_info(mapping->host),
> -						mapping));
> -				for (i = 0; i < in_buf_folios; i++) {
> -					if (data_in) {
> -						kunmap_local(data_in);
> -						folio_put(in_folio);
> -						data_in = NULL;
> -					}
> -					ret = btrfs_compress_filemap_get_folio(mapping,
> -							start, &in_folio);
> -					if (ret < 0)
> -						goto out;
> -					data_in = kmap_local_folio(in_folio, 0);
> -					copy_page(workspace->buf + i * PAGE_SIZE,
> -						  data_in);
> -					start += PAGE_SIZE;
> -				}
> +			/*
> +			 * This can only happen when hardware zlib compression is
> +			 * enabled.
> +			 */
> +			if (copy_length > PAGE_SIZE) {
> +				ret = copy_data_into_buffer(mapping, workspace,
> +							    start, copy_length);
> +				if (ret < 0)
> +					goto out;
> +				start += copy_length;
>  				workspace->strm.next_in = workspace->buf;
> -				workspace->strm.avail_in = min(bytes_left,
> -							       in_buf_folios << PAGE_SHIFT);
> +				workspace->strm.avail_in = copy_length;
>  			} else {
>  				unsigned int pg_off;
>  				unsigned int cur_len;

Patch

diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index c9e92c6941ec..7d81d9a0b3a0 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -94,6 +94,47 @@  struct list_head *zlib_alloc_workspace(unsigned int level)
 	return ERR_PTR(-ENOMEM);
 }
 
+/*
+ * Helper for S390x with hardware zlib compression support.
+ *
+ * That hardware acceleration requires a buffer size larger than a single page
+ * to get ideal performance, thus we need to do the memory copy other than
+ * use the page cache directly as input buffer.
+ */
+static int copy_data_into_buffer(struct address_space *mapping,
+				 struct workspace *workspace, u64 filepos,
+				 unsigned long length)
+{
+	u64 cur = filepos;
+
+	/* It's only for hardware accelerated zlib code. */
+	ASSERT(zlib_deflate_dfltcc_enabled());
+
+	while (cur < filepos + length) {
+		struct folio *folio;
+		void *data_in;
+		unsigned int offset;
+		unsigned long copy_length;
+		int ret;
+
+		ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio);
+		if (ret < 0)
+			return ret;
+		/* No larger folio support yet. */
+		ASSERT(!folio_test_large(folio));
+
+		offset = offset_in_folio(folio, cur);
+		copy_length = min(folio_size(folio) - offset,
+				  filepos + length - cur);
+
+		data_in = kmap_local_folio(folio, offset);
+		memcpy(workspace->buf + cur - filepos, data_in, copy_length);
+		kunmap_local(data_in);
+		cur += copy_length;
+	}
+	return 0;
+}
+
 int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
 			 u64 start, struct folio **folios, unsigned long *out_folios,
 			 unsigned long *total_in, unsigned long *total_out)
@@ -105,8 +146,6 @@  int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
 	int nr_folios = 0;
 	struct folio *in_folio = NULL;
 	struct folio *out_folio = NULL;
-	unsigned long bytes_left;
-	unsigned int in_buf_folios;
 	unsigned long len = *total_out;
 	unsigned long nr_dest_folios = *out_folios;
 	const unsigned long max_out = nr_dest_folios * PAGE_SIZE;
@@ -150,34 +189,21 @@  int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
 		 * the workspace buffer if required.
 		 */
 		if (workspace->strm.avail_in == 0) {
-			bytes_left = len - workspace->strm.total_in;
-			in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
-					    workspace->buf_size / PAGE_SIZE);
-			if (in_buf_folios > 1) {
-				int i;
+			unsigned long bytes_left = len - workspace->strm.total_in;
+			unsigned int copy_length = min(bytes_left, workspace->buf_size);
 
-				/* S390 hardware acceleration path, not subpage. */
-				ASSERT(!btrfs_is_subpage(
-						inode_to_fs_info(mapping->host),
-						mapping));
-				for (i = 0; i < in_buf_folios; i++) {
-					if (data_in) {
-						kunmap_local(data_in);
-						folio_put(in_folio);
-						data_in = NULL;
-					}
-					ret = btrfs_compress_filemap_get_folio(mapping,
-							start, &in_folio);
-					if (ret < 0)
-						goto out;
-					data_in = kmap_local_folio(in_folio, 0);
-					copy_page(workspace->buf + i * PAGE_SIZE,
-						  data_in);
-					start += PAGE_SIZE;
-				}
+			/*
+			 * This can only happen when hardware zlib compression is
+			 * enabled.
+			 */
+			if (copy_length > PAGE_SIZE) {
+				ret = copy_data_into_buffer(mapping, workspace,
+							    start, copy_length);
+				if (ret < 0)
+					goto out;
+				start += copy_length;
 				workspace->strm.next_in = workspace->buf;
-				workspace->strm.avail_in = min(bytes_left,
-							       in_buf_folios << PAGE_SHIFT);
+				workspace->strm.avail_in = copy_length;
 			} else {
 				unsigned int pg_off;
 				unsigned int cur_len;