diff mbox series

[v3,01/14] fs: Move enum rw_hint into a new header file

Message ID 20231017204739.3409052-2-bvanassche@acm.org (mailing list archive)
State New, archived
Headers show
Series Pass data temperature information to SCSI disk devices | expand

Commit Message

Bart Van Assche Oct. 17, 2023, 8:47 p.m. UTC
Move enum rw_hint into a new header file to prepare for using this data
type in the block layer. Add the attribute __packed to reduce the space
occupied by instances of this data type from four bytes to one byte.
Change the data type of i_write_hint from u8 into enum rw_hint. Change
the RWH_* constants into literal constants to prevent that
<uapi/linux/fcntl.h> would have to be included.

Cc: Jan Kara <jack@suse.cz>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Christian Brauner <brauner@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 fs/f2fs/f2fs.h          |  1 +
 fs/fcntl.c              |  1 +
 fs/inode.c              |  1 +
 include/linux/fs.h      | 16 ++--------------
 include/linux/rw_hint.h | 20 ++++++++++++++++++++
 5 files changed, 25 insertions(+), 14 deletions(-)
 create mode 100644 include/linux/rw_hint.h

Comments

Kanchan Joshi Oct. 30, 2023, 11:11 a.m. UTC | #1
On 10/18/2023 2:17 AM, Bart Van Assche wrote:
> - * Write life time hint values.
> - * Stored in struct inode as u8.
> - */
> -enum rw_hint {
> -	WRITE_LIFE_NOT_SET	= 0,
> -	WRITE_LIFE_NONE		= RWH_WRITE_LIFE_NONE,
> -	WRITE_LIFE_SHORT	= RWH_WRITE_LIFE_SHORT,
> -	WRITE_LIFE_MEDIUM	= RWH_WRITE_LIFE_MEDIUM,
> -	WRITE_LIFE_LONG		= RWH_WRITE_LIFE_LONG,
> -	WRITE_LIFE_EXTREME	= RWH_WRITE_LIFE_EXTREME,
> -};
> -
>   /* Match RWF_* bits to IOCB bits */
>   #define IOCB_HIPRI		(__force int) RWF_HIPRI
>   #define IOCB_DSYNC		(__force int) RWF_DSYNC
> @@ -677,7 +665,7 @@ struct inode {
>   	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
>   	unsigned short          i_bytes;
>   	u8			i_blkbits;
> -	u8			i_write_hint;
> +	enum rw_hint		i_write_hint;
>   	blkcnt_t		i_blocks;
>   
>   #ifdef __NEED_I_SIZE_ORDERED
> diff --git a/include/linux/rw_hint.h b/include/linux/rw_hint.h
> new file mode 100644
> index 000000000000..4a7d28945973
> --- /dev/null
> +++ b/include/linux/rw_hint.h
> @@ -0,0 +1,20 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_RW_HINT_H
> +#define _LINUX_RW_HINT_H
> +
> +#include <linux/build_bug.h>
> +#include <linux/compiler_attributes.h>
> +
> +/* Block storage write lifetime hint values. */
> +enum rw_hint {
> +	WRITE_LIFE_NOT_SET	= 0, /* RWH_WRITE_LIFE_NOT_SET */
> +	WRITE_LIFE_NONE		= 1, /* RWH_WRITE_LIFE_NONE */
> +	WRITE_LIFE_SHORT	= 2, /* RWH_WRITE_LIFE_SHORT */
> +	WRITE_LIFE_MEDIUM	= 3, /* RWH_WRITE_LIFE_MEDIUM */
> +	WRITE_LIFE_LONG		= 4, /* RWH_WRITE_LIFE_LONG */
> +	WRITE_LIFE_EXTREME	= 5, /* RWH_WRITE_LIFE_EXTREME */
> +} __packed;
> +
> +static_assert(sizeof(enum rw_hint) == 1);

Does it make sense to do away with these, and have temperature-neutral 
names instead e.g., WRITE_LIFE_1, WRITE_LIFE_2?

With the current choice:
- If the count goes up (beyond 5 hints), infra can scale fine but these 
names do not. Imagine ULTRA_EXTREME after EXTREME.
- Applications or in-kernel users can specify LONG hint with data that 
actually has a SHORT lifetime. Nothing really ensures that LONG is 
really LONG.

Temperature-neutral names seem more generic/scalable and do not present 
the unnecessary need to be accurate with relative temperatures.
Bart Van Assche Oct. 30, 2023, 4:10 p.m. UTC | #2
On 10/30/23 04:11, Kanchan Joshi wrote:
> On 10/18/2023 2:17 AM, Bart Van Assche wrote:
>> +/* Block storage write lifetime hint values. */
>> +enum rw_hint {
>> +	WRITE_LIFE_NOT_SET	= 0, /* RWH_WRITE_LIFE_NOT_SET */
>> +	WRITE_LIFE_NONE		= 1, /* RWH_WRITE_LIFE_NONE */
>> +	WRITE_LIFE_SHORT	= 2, /* RWH_WRITE_LIFE_SHORT */
>> +	WRITE_LIFE_MEDIUM	= 3, /* RWH_WRITE_LIFE_MEDIUM */
>> +	WRITE_LIFE_LONG		= 4, /* RWH_WRITE_LIFE_LONG */
>> +	WRITE_LIFE_EXTREME	= 5, /* RWH_WRITE_LIFE_EXTREME */
>> +} __packed;
>> +
>> +static_assert(sizeof(enum rw_hint) == 1);
> 
> Does it make sense to do away with these, and have temperature-neutral
> names instead e.g., WRITE_LIFE_1, WRITE_LIFE_2?
> 
> With the current choice:
> - If the count goes up (beyond 5 hints), infra can scale fine but these
> names do not. Imagine ULTRA_EXTREME after EXTREME.
> - Applications or in-kernel users can specify LONG hint with data that
> actually has a SHORT lifetime. Nothing really ensures that LONG is
> really LONG.
> 
> Temperature-neutral names seem more generic/scalable and do not present
> the unnecessary need to be accurate with relative temperatures.

Thanks for having taken a look at this patch series. Jens asked for data
that shows that this patch series improves performance. Is this
something Samsung can help with?

Thanks,

Bart.
Daejun Park Nov. 1, 2023, 6:39 a.m. UTC | #3
Hi Bart,

>On 10/30/23 04:11, Kanchan Joshi wrote:
>> On 10/18/2023 2:17 AM, Bart Van Assche wrote:
>>> +/* Block storage write lifetime hint values. */
>>> +enum rw_hint {
>>> +        WRITE_LIFE_NOT_SET        = 0, /* RWH_WRITE_LIFE_NOT_SET */
>>> +        WRITE_LIFE_NONE                = 1, /* RWH_WRITE_LIFE_NONE */
>>> +        WRITE_LIFE_SHORT        = 2, /* RWH_WRITE_LIFE_SHORT */
>>> +        WRITE_LIFE_MEDIUM        = 3, /* RWH_WRITE_LIFE_MEDIUM */
>>> +        WRITE_LIFE_LONG                = 4, /* RWH_WRITE_LIFE_LONG */
>>> +        WRITE_LIFE_EXTREME        = 5, /* RWH_WRITE_LIFE_EXTREME */
>>> +} __packed;
>>> +
>>> +static_assert(sizeof(enum rw_hint) == 1);
>> 
>> Does it make sense to do away with these, and have temperature-neutral
>> names instead e.g., WRITE_LIFE_1, WRITE_LIFE_2?
>> 
>> With the current choice:
>> - If the count goes up (beyond 5 hints), infra can scale fine but these
>> names do not. Imagine ULTRA_EXTREME after EXTREME.
>> - Applications or in-kernel users can specify LONG hint with data that
>> actually has a SHORT lifetime. Nothing really ensures that LONG is
>> really LONG.
>> 
>> Temperature-neutral names seem more generic/scalable and do not present
>> the unnecessary need to be accurate with relative temperatures.
>
>Thanks for having taken a look at this patch series. Jens asked for data
>that shows that this patch series improves performance. Is this
>something Samsung can help with?

We analyzed the NAND block erase counter with and without stream separation
through a long-term workload in F2FS.
The analysis showed that the erase counter is reduced by approximately 40% 
with stream seperation.
Long-term workload is a scenario where erase and write are repeated by
stream after performing precondition fill for each temperature of F2FS.

Thanks,

Daejun.

>
>Thanks,
>
>Bart.
>

>
Bart Van Assche Nov. 1, 2023, 4:45 p.m. UTC | #4
On 10/31/23 23:39, Daejun Park wrote:
>> On 10/30/23 04:11, Kanchan Joshi wrote:
>>> On 10/18/2023 2:17 AM, Bart Van Assche wrote:
>> Thanks for having taken a look at this patch series. Jens asked for data
>> that shows that this patch series improves performance. Is this
>> something Samsung can help with?
> 
> We analyzed the NAND block erase counter with and without stream separation
> through a long-term workload in F2FS.
> The analysis showed that the erase counter is reduced by approximately 40%
> with stream seperation.
> Long-term workload is a scenario where erase and write are repeated by
> stream after performing precondition fill for each temperature of F2FS.

Hi Daejun,

Thank you for having shared this data. This is very helpful. Since I'm
not familiar with the erase counter: does the above data perhaps mean
that write amplification is reduced by 40% in the workload that has been
examined?

Thanks,

Bart.
Daejun Park Nov. 2, 2023, 7:31 a.m. UTC | #5
Hi Bart,

>On 10/31/23 23:39, Daejun Park wrote:
>>> On 10/30/23 04:11, Kanchan Joshi wrote:
>>>> On 10/18/2023 2:17 AM, Bart Van Assche wrote:
>>> Thanks for having taken a look at this patch series. Jens asked for data
>>> that shows that this patch series improves performance. Is this
>>> something Samsung can help with?
>> 
>> We analyzed the NAND block erase counter with and without stream separation
>> through a long-term workload in F2FS.
>> The analysis showed that the erase counter is reduced by approximately 40%
>> with stream seperation.
>> Long-term workload is a scenario where erase and write are repeated by
>> stream after performing precondition fill for each temperature of F2FS.
>
>Hi Daejun,
>
>Thank you for having shared this data. This is very helpful. Since I'm
>not familiar with the erase counter: does the above data perhaps mean
>that write amplification is reduced by 40% in the workload that has been
>examined?

WAF is not only caused by GC. It is also caused by other reasons.
During device GC, the valid pages in the victim block are migrated, and a
lower erase counter means that the effective GC is performed by selecting
a victim block with a small number of invalid pages.
Thus, it can be said that the WAF can be decreased about 40% by selecting
fewer victim blocks during device GC.

Thanks,

Daejun

>
>Thanks,
>
>Bart.
diff mbox series

Patch

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6d688e42d89c..56ee7fff55c7 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -24,6 +24,7 @@ 
 #include <linux/blkdev.h>
 #include <linux/quotaops.h>
 #include <linux/part_stat.h>
+#include <linux/rw_hint.h>
 #include <crypto/hash.h>
 
 #include <linux/fscrypt.h>
diff --git a/fs/fcntl.c b/fs/fcntl.c
index e871009f6c88..ed923640aecf 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -27,6 +27,7 @@ 
 #include <linux/memfd.h>
 #include <linux/compat.h>
 #include <linux/mount.h>
+#include <linux/rw_hint.h>
 
 #include <linux/poll.h>
 #include <asm/siginfo.h>
diff --git a/fs/inode.c b/fs/inode.c
index 84bc3c76e5cc..ebcc41ac9682 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -20,6 +20,7 @@ 
 #include <linux/ratelimit.h>
 #include <linux/list_lru.h>
 #include <linux/iversion.h>
+#include <linux/rw_hint.h>
 #include <trace/events/writeback.h>
 #include "internal.h"
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b528f063e8ff..971f0bafa782 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -43,6 +43,7 @@ 
 #include <linux/cred.h>
 #include <linux/mnt_idmapping.h>
 #include <linux/slab.h>
+#include <linux/rw_hint.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -309,19 +310,6 @@  struct address_space;
 struct writeback_control;
 struct readahead_control;
 
-/*
- * Write life time hint values.
- * Stored in struct inode as u8.
- */
-enum rw_hint {
-	WRITE_LIFE_NOT_SET	= 0,
-	WRITE_LIFE_NONE		= RWH_WRITE_LIFE_NONE,
-	WRITE_LIFE_SHORT	= RWH_WRITE_LIFE_SHORT,
-	WRITE_LIFE_MEDIUM	= RWH_WRITE_LIFE_MEDIUM,
-	WRITE_LIFE_LONG		= RWH_WRITE_LIFE_LONG,
-	WRITE_LIFE_EXTREME	= RWH_WRITE_LIFE_EXTREME,
-};
-
 /* Match RWF_* bits to IOCB bits */
 #define IOCB_HIPRI		(__force int) RWF_HIPRI
 #define IOCB_DSYNC		(__force int) RWF_DSYNC
@@ -677,7 +665,7 @@  struct inode {
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	unsigned short          i_bytes;
 	u8			i_blkbits;
-	u8			i_write_hint;
+	enum rw_hint		i_write_hint;
 	blkcnt_t		i_blocks;
 
 #ifdef __NEED_I_SIZE_ORDERED
diff --git a/include/linux/rw_hint.h b/include/linux/rw_hint.h
new file mode 100644
index 000000000000..4a7d28945973
--- /dev/null
+++ b/include/linux/rw_hint.h
@@ -0,0 +1,20 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RW_HINT_H
+#define _LINUX_RW_HINT_H
+
+#include <linux/build_bug.h>
+#include <linux/compiler_attributes.h>
+
+/* Block storage write lifetime hint values. */
+enum rw_hint {
+	WRITE_LIFE_NOT_SET	= 0, /* RWH_WRITE_LIFE_NOT_SET */
+	WRITE_LIFE_NONE		= 1, /* RWH_WRITE_LIFE_NONE */
+	WRITE_LIFE_SHORT	= 2, /* RWH_WRITE_LIFE_SHORT */
+	WRITE_LIFE_MEDIUM	= 3, /* RWH_WRITE_LIFE_MEDIUM */
+	WRITE_LIFE_LONG		= 4, /* RWH_WRITE_LIFE_LONG */
+	WRITE_LIFE_EXTREME	= 5, /* RWH_WRITE_LIFE_EXTREME */
+} __packed;
+
+static_assert(sizeof(enum rw_hint) == 1);
+
+#endif /* _LINUX_RW_HINT_H */