diff mbox series

[RFC,02/16] fs/bdev: Add atomic write support info to statx

Message ID 20230503183821.1473305-3-john.g.garry@oracle.com (mailing list archive)
State New, archived
Headers show
Series block atomic writes | expand

Commit Message

John Garry May 3, 2023, 6:38 p.m. UTC
From: Prasad Singamsetty <prasad.singamsetty@oracle.com>

Extend statx system call to return additional info for atomic write support
support if the specified file is a block device.

Add initial support for a block device.

Signed-off-by: Prasad Singamsetty <prasad.singamsetty@oracle.com>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 block/bdev.c              | 21 +++++++++++++++++++++
 fs/stat.c                 | 10 ++++++++++
 include/linux/blkdev.h    |  4 ++++
 include/linux/stat.h      |  2 ++
 include/uapi/linux/stat.h |  7 ++++++-
 5 files changed, 43 insertions(+), 1 deletion(-)

Comments

Dave Chinner May 3, 2023, 9:58 p.m. UTC | #1
On Wed, May 03, 2023 at 06:38:07PM +0000, John Garry wrote:
> From: Prasad Singamsetty <prasad.singamsetty@oracle.com>
> 
> Extend statx system call to return additional info for atomic write support
> support if the specified file is a block device.
> 
> Add initial support for a block device.
> 
> Signed-off-by: Prasad Singamsetty <prasad.singamsetty@oracle.com>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  block/bdev.c              | 21 +++++++++++++++++++++
>  fs/stat.c                 | 10 ++++++++++
>  include/linux/blkdev.h    |  4 ++++
>  include/linux/stat.h      |  2 ++
>  include/uapi/linux/stat.h |  7 ++++++-
>  5 files changed, 43 insertions(+), 1 deletion(-)
> 
> diff --git a/block/bdev.c b/block/bdev.c
> index 1795c7d4b99e..6a5fd5abaadc 100644
> --- a/block/bdev.c
> +++ b/block/bdev.c
> @@ -1014,3 +1014,24 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
>  
>  	blkdev_put_no_open(bdev);
>  }
> +
> +/*
> + * Handle statx for block devices to get properties of WRITE ATOMIC
> + * feature support.
> + */
> +void bdev_statx_atomic(struct inode *inode, struct kstat *stat)
> +{
> +	struct block_device *bdev;
> +
> +	bdev = blkdev_get_no_open(inode->i_rdev);
> +	if (!bdev)
> +		return;
> +
> +	stat->atomic_write_unit_min = queue_atomic_write_unit_min(bdev->bd_queue);
> +	stat->atomic_write_unit_max = queue_atomic_write_unit_max(bdev->bd_queue);
> +	stat->attributes |= STATX_ATTR_WRITE_ATOMIC;
> +	stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC;
> +	stat->result_mask |= STATX_WRITE_ATOMIC;
> +
> +	blkdev_put_no_open(bdev);
> +}
> diff --git a/fs/stat.c b/fs/stat.c
> index 7c238da22ef0..d20334a0e9ae 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -256,6 +256,14 @@ static int vfs_statx(int dfd, struct filename *filename, int flags,
>  			bdev_statx_dioalign(inode, stat);
>  	}
>  
> +	/* Handle STATX_WRITE_ATOMIC for block devices */
> +	if (request_mask & STATX_WRITE_ATOMIC) {
> +		struct inode *inode = d_backing_inode(path.dentry);
> +
> +		if (S_ISBLK(inode->i_mode))
> +			bdev_statx_atomic(inode, stat);
> +	}

This duplicates STATX_DIOALIGN bdev handling.

Really, the bdev attribute handling should be completely factored
out of vfs_statx() - blockdevs are not the common fastpath for stat
operations. Somthing like:

	/*
	 * If this is a block device inode, override the filesystem
	 * attributes with the block device specific parameters
	 * that need to be obtained from the bdev backing inode.
	 */
	if (S_ISBLK(d_backing_inode(path.dentry)->i_mode))
		bdev_statx(path.dentry, stat);

And then all the overrides can go in the one function that doesn't
need to repeatedly check S_ISBLK()....


> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 6b6f2992338c..19d33b2897b2 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1527,6 +1527,7 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend);
>  int sync_blockdev_nowait(struct block_device *bdev);
>  void sync_bdevs(bool wait);
>  void bdev_statx_dioalign(struct inode *inode, struct kstat *stat);
> +void bdev_statx_atomic(struct inode *inode, struct kstat *stat);
>  void printk_all_partitions(void);
>  #else
>  static inline void invalidate_bdev(struct block_device *bdev)
> @@ -1546,6 +1547,9 @@ static inline void sync_bdevs(bool wait)
>  static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
>  {
>  }
> +static inline void bdev_statx_atomic(struct inode *inode, struct kstat *stat)
> +{
> +}
>  static inline void printk_all_partitions(void)
>  {
>  }

That also gets rid of the need for all these fine grained exports
out of the bdev code for statx....

> diff --git a/include/linux/stat.h b/include/linux/stat.h
> index 52150570d37a..dfa69ecfaacf 100644
> --- a/include/linux/stat.h
> +++ b/include/linux/stat.h
> @@ -53,6 +53,8 @@ struct kstat {
>  	u32		dio_mem_align;
>  	u32		dio_offset_align;
>  	u64		change_cookie;
> +	u32		atomic_write_unit_max;
> +	u32		atomic_write_unit_min;
>  };
>  
>  /* These definitions are internal to the kernel for now. Mainly used by nfsd. */
> diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
> index 7cab2c65d3d7..c99d7cac2aa6 100644
> --- a/include/uapi/linux/stat.h
> +++ b/include/uapi/linux/stat.h
> @@ -127,7 +127,10 @@ struct statx {
>  	__u32	stx_dio_mem_align;	/* Memory buffer alignment for direct I/O */
>  	__u32	stx_dio_offset_align;	/* File offset alignment for direct I/O */
>  	/* 0xa0 */
> -	__u64	__spare3[12];	/* Spare space for future expansion */
> +	__u32	stx_atomic_write_unit_max;
> +	__u32	stx_atomic_write_unit_min;
> +	/* 0xb0 */
> +	__u64	__spare3[11];	/* Spare space for future expansion */
>  	/* 0x100 */
>  };

No documentation on what units these are in. Is there a statx() man
page update for this addition?

Cheers,

Dave.
John Garry May 4, 2023, 8:45 a.m. UTC | #2
On 03/05/2023 22:58, Dave Chinner wrote:

Hi Dave,

>> +	/* Handle STATX_WRITE_ATOMIC for block devices */
>> +	if (request_mask & STATX_WRITE_ATOMIC) {
>> +		struct inode *inode = d_backing_inode(path.dentry);
>> +
>> +		if (S_ISBLK(inode->i_mode))
>> +			bdev_statx_atomic(inode, stat);
>> +	}
> This duplicates STATX_DIOALIGN bdev handling.
> 
> Really, the bdev attribute handling should be completely factored
> out of vfs_statx() - blockdevs are not the common fastpath for stat
> operations. Somthing like:
> 
> 	/*
> 	 * If this is a block device inode, override the filesystem
> 	 * attributes with the block device specific parameters
> 	 * that need to be obtained from the bdev backing inode.
> 	 */
> 	if (S_ISBLK(d_backing_inode(path.dentry)->i_mode))
> 		bdev_statx(path.dentry, stat);
> 
> And then all the overrides can go in the one function that doesn't
> need to repeatedly check S_ISBLK()....

ok, that looks like a reasonable idea. We'll look to make that change.

> 
> 
>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>> index 6b6f2992338c..19d33b2897b2 100644
>> --- a/include/linux/blkdev.h
>> +++ b/include/linux/blkdev.h
>> @@ -1527,6 +1527,7 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend);
>>   int sync_blockdev_nowait(struct block_device *bdev);
>>   void sync_bdevs(bool wait);
>>   void bdev_statx_dioalign(struct inode *inode, struct kstat *stat);
>> +void bdev_statx_atomic(struct inode *inode, struct kstat *stat);
>>   void printk_all_partitions(void);
>>   #else
>>   static inline void invalidate_bdev(struct block_device *bdev)
>> @@ -1546,6 +1547,9 @@ static inline void sync_bdevs(bool wait)
>>   static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
>>   {
>>   }
>> +static inline void bdev_statx_atomic(struct inode *inode, struct kstat *stat)
>> +{
>> +}
>>   static inline void printk_all_partitions(void)
>>   {
>>   }
> That also gets rid of the need for all these fine grained exports
> out of the bdev code for statx....

Sure

> 
>> diff --git a/include/linux/stat.h b/include/linux/stat.h
>> index 52150570d37a..dfa69ecfaacf 100644
>> --- a/include/linux/stat.h
>> +++ b/include/linux/stat.h
>> @@ -53,6 +53,8 @@ struct kstat {
>>   	u32		dio_mem_align;
>>   	u32		dio_offset_align;
>>   	u64		change_cookie;
>> +	u32		atomic_write_unit_max;
>> +	u32		atomic_write_unit_min;
>>   };
>>   
>>   /* These definitions are internal to the kernel for now. Mainly used by nfsd. */
>> diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
>> index 7cab2c65d3d7..c99d7cac2aa6 100644
>> --- a/include/uapi/linux/stat.h
>> +++ b/include/uapi/linux/stat.h
>> @@ -127,7 +127,10 @@ struct statx {
>>   	__u32	stx_dio_mem_align;	/* Memory buffer alignment for direct I/O */
>>   	__u32	stx_dio_offset_align;	/* File offset alignment for direct I/O */
>>   	/* 0xa0 */
>> -	__u64	__spare3[12];	/* Spare space for future expansion */
>> +	__u32	stx_atomic_write_unit_max;
>> +	__u32	stx_atomic_write_unit_min;
>> +	/* 0xb0 */
>> +	__u64	__spare3[11];	/* Spare space for future expansion */
>>   	/* 0x100 */
>>   };
> No documentation on what units these are in.

It's in bytes, we're really just continuing the pattern of what we do 
for dio. I think that it would be reasonable to limit to u32.

> Is there a statx() man
> page update for this addition?

No, not yet. Is it normally expected to provide a proposed man page 
update in parallel? Or somewhat later, when the kernel API change has 
some appreciable level of agreement?

Thanks,
John
Dave Chinner May 4, 2023, 10:40 p.m. UTC | #3
On Thu, May 04, 2023 at 09:45:50AM +0100, John Garry wrote:
> On 03/05/2023 22:58, Dave Chinner wrote:
> > Is there a statx() man
> > page update for this addition?
> 
> No, not yet. Is it normally expected to provide a proposed man page update
> in parallel? Or somewhat later, when the kernel API change has some
> appreciable level of agreement?

Normally we ask for man page updates to be presented at the same
time, as the man page defines the user interface that is being
implemented. In this case, we need updates for the pwritev2() man
page to document RWF_ATOMIC semantics, and the statx() man page to
document what the variables being exposed mean w.r.t. RWF_ATOMIC.

The pwritev2() man page is probably the most important one right now
- it needs to explain the guarantees that RWF_ATOMIC is supposed to
provide w.r.t. data integrity, IO ordering, persistence, etc.
Indeed, it will need to explain exactly how this "multi-atomic-unit
mulit-bio non-atomic RWF_ATOMIC" IO thing can be used safely and
reliably, especially w.r.t. IO ordering and persistence guarantees
in the face of crashes and power failures. Not to mention
documenting error conditions specific to RWF_ATOMIC...

It's all well and good to have some implementation, but without
actually defining and documenting the *guarantees* that RWF_ATOMIC
provides userspace it is completely useless for application
developers. And from the perspective of a reviewer, without the
documentation stating what the infrastructure actually guarantees
applications, we can't determine if the implementation being
presented is fit for purpose....

Cheers,

Dave.
John Garry May 5, 2023, 8:01 a.m. UTC | #4
On 04/05/2023 23:40, Dave Chinner wrote:

Hi Dave,

>> No, not yet. Is it normally expected to provide a proposed man page update
>> in parallel? Or somewhat later, when the kernel API change has some
>> appreciable level of agreement?
> Normally we ask for man page updates to be presented at the same
> time, as the man page defines the user interface that is being
> implemented. In this case, we need updates for the pwritev2() man
> page to document RWF_ATOMIC semantics, and the statx() man page to
> document what the variables being exposed mean w.r.t. RWF_ATOMIC.
> 
> The pwritev2() man page is probably the most important one right now
> - it needs to explain the guarantees that RWF_ATOMIC is supposed to
> provide w.r.t. data integrity, IO ordering, persistence, etc.
> Indeed, it will need to explain exactly how this "multi-atomic-unit
> mulit-bio non-atomic RWF_ATOMIC" IO thing can be used safely and
> reliably, especially w.r.t. IO ordering and persistence guarantees
> in the face of crashes and power failures. Not to mention
> documenting error conditions specific to RWF_ATOMIC...
> 
> It's all well and good to have some implementation, but without
> actually defining and documenting the*guarantees*  that RWF_ATOMIC
> provides userspace it is completely useless for application
> developers. And from the perspective of a reviewer, without the
> documentation stating what the infrastructure actually guarantees
> applications, we can't determine if the implementation being
> presented is fit for purpose....

ok, understood. Obviously from any discussion so far there are many 
details which the user needs to know about how to use this interface and 
what to expect.

We'll look to start working on those man page details now.

Thanks,
John
Darrick J. Wong May 5, 2023, 10:04 p.m. UTC | #5
On Fri, May 05, 2023 at 09:01:58AM +0100, John Garry wrote:
> On 04/05/2023 23:40, Dave Chinner wrote:
> 
> Hi Dave,
> 
> > > No, not yet. Is it normally expected to provide a proposed man page update
> > > in parallel? Or somewhat later, when the kernel API change has some
> > > appreciable level of agreement?
> > Normally we ask for man page updates to be presented at the same
> > time, as the man page defines the user interface that is being
> > implemented. In this case, we need updates for the pwritev2() man
> > page to document RWF_ATOMIC semantics, and the statx() man page to
> > document what the variables being exposed mean w.r.t. RWF_ATOMIC.
> > 
> > The pwritev2() man page is probably the most important one right now
> > - it needs to explain the guarantees that RWF_ATOMIC is supposed to
> > provide w.r.t. data integrity, IO ordering, persistence, etc.
> > Indeed, it will need to explain exactly how this "multi-atomic-unit
> > mulit-bio non-atomic RWF_ATOMIC" IO thing can be used safely and
> > reliably, especially w.r.t. IO ordering and persistence guarantees
> > in the face of crashes and power failures. Not to mention
> > documenting error conditions specific to RWF_ATOMIC...
> > 
> > It's all well and good to have some implementation, but without
> > actually defining and documenting the*guarantees*  that RWF_ATOMIC
> > provides userspace it is completely useless for application
> > developers. And from the perspective of a reviewer, without the
> > documentation stating what the infrastructure actually guarantees
> > applications, we can't determine if the implementation being
> > presented is fit for purpose....
> 
> ok, understood. Obviously from any discussion so far there are many details
> which the user needs to know about how to use this interface and what to
> expect.
> 
> We'll look to start working on those man page details now.

Agreed.  The manpage contents are what needs to get worked on at LSFMM
where you'll have various block/fs/storage device people in the same
room with which to discuss various issues and try to smooth out the
misundertandings.

(Also: I've decided to cancel my in-person attendance due to a sudden
health issue.   I'll still be in the room, just virtually now. :()

--D

> Thanks,
> John
diff mbox series

Patch

diff --git a/block/bdev.c b/block/bdev.c
index 1795c7d4b99e..6a5fd5abaadc 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -1014,3 +1014,24 @@  void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
 
 	blkdev_put_no_open(bdev);
 }
+
+/*
+ * Handle statx for block devices to get properties of WRITE ATOMIC
+ * feature support.
+ */
+void bdev_statx_atomic(struct inode *inode, struct kstat *stat)
+{
+	struct block_device *bdev;
+
+	bdev = blkdev_get_no_open(inode->i_rdev);
+	if (!bdev)
+		return;
+
+	stat->atomic_write_unit_min = queue_atomic_write_unit_min(bdev->bd_queue);
+	stat->atomic_write_unit_max = queue_atomic_write_unit_max(bdev->bd_queue);
+	stat->attributes |= STATX_ATTR_WRITE_ATOMIC;
+	stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC;
+	stat->result_mask |= STATX_WRITE_ATOMIC;
+
+	blkdev_put_no_open(bdev);
+}
diff --git a/fs/stat.c b/fs/stat.c
index 7c238da22ef0..d20334a0e9ae 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -256,6 +256,14 @@  static int vfs_statx(int dfd, struct filename *filename, int flags,
 			bdev_statx_dioalign(inode, stat);
 	}
 
+	/* Handle STATX_WRITE_ATOMIC for block devices */
+	if (request_mask & STATX_WRITE_ATOMIC) {
+		struct inode *inode = d_backing_inode(path.dentry);
+
+		if (S_ISBLK(inode->i_mode))
+			bdev_statx_atomic(inode, stat);
+	}
+
 	path_put(&path);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -636,6 +644,8 @@  cp_statx(const struct kstat *stat, struct statx __user *buffer)
 	tmp.stx_mnt_id = stat->mnt_id;
 	tmp.stx_dio_mem_align = stat->dio_mem_align;
 	tmp.stx_dio_offset_align = stat->dio_offset_align;
+	tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min;
+	tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max;
 
 	return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6b6f2992338c..19d33b2897b2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1527,6 +1527,7 @@  int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend);
 int sync_blockdev_nowait(struct block_device *bdev);
 void sync_bdevs(bool wait);
 void bdev_statx_dioalign(struct inode *inode, struct kstat *stat);
+void bdev_statx_atomic(struct inode *inode, struct kstat *stat);
 void printk_all_partitions(void);
 #else
 static inline void invalidate_bdev(struct block_device *bdev)
@@ -1546,6 +1547,9 @@  static inline void sync_bdevs(bool wait)
 static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
 {
 }
+static inline void bdev_statx_atomic(struct inode *inode, struct kstat *stat)
+{
+}
 static inline void printk_all_partitions(void)
 {
 }
diff --git a/include/linux/stat.h b/include/linux/stat.h
index 52150570d37a..dfa69ecfaacf 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -53,6 +53,8 @@  struct kstat {
 	u32		dio_mem_align;
 	u32		dio_offset_align;
 	u64		change_cookie;
+	u32		atomic_write_unit_max;
+	u32		atomic_write_unit_min;
 };
 
 /* These definitions are internal to the kernel for now. Mainly used by nfsd. */
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 7cab2c65d3d7..c99d7cac2aa6 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -127,7 +127,10 @@  struct statx {
 	__u32	stx_dio_mem_align;	/* Memory buffer alignment for direct I/O */
 	__u32	stx_dio_offset_align;	/* File offset alignment for direct I/O */
 	/* 0xa0 */
-	__u64	__spare3[12];	/* Spare space for future expansion */
+	__u32	stx_atomic_write_unit_max;
+	__u32	stx_atomic_write_unit_min;
+	/* 0xb0 */
+	__u64	__spare3[11];	/* Spare space for future expansion */
 	/* 0x100 */
 };
 
@@ -154,6 +157,7 @@  struct statx {
 #define STATX_BTIME		0x00000800U	/* Want/got stx_btime */
 #define STATX_MNT_ID		0x00001000U	/* Got stx_mnt_id */
 #define STATX_DIOALIGN		0x00002000U	/* Want/got direct I/O alignment info */
+#define STATX_WRITE_ATOMIC	0x00004000U	/* Want/got atomic_write_* fields */
 
 #define STATX__RESERVED		0x80000000U	/* Reserved for future struct statx expansion */
 
@@ -189,6 +193,7 @@  struct statx {
 #define STATX_ATTR_MOUNT_ROOT		0x00002000 /* Root of a mount */
 #define STATX_ATTR_VERITY		0x00100000 /* [I] Verity protected file */
 #define STATX_ATTR_DAX			0x00200000 /* File is currently in DAX state */
+#define STATX_ATTR_WRITE_ATOMIC		0x00400000 /* File supports atomic write operations */
 
 
 #endif /* _UAPI_LINUX_STAT_H */