
[RFC] ceph: serialize the direct writes when couldn't submit in a single req

Message ID 20200204015445.4435-1-xiubli@redhat.com (mailing list archive)
State New, archived
Series [RFC] ceph: serialize the direct writes when couldn't submit in a single req

Commit Message

Xiubo Li Feb. 4, 2020, 1:54 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

If the direct IO can't be submitted in a single request, writes from
multiple writers may overlap each other.

For example, with the file layout:
ceph.file.layout="stripe_unit=4194304 stripe_count=1 object_size=4194304"

fd = open(, O_DIRECT | O_WRONLY, );

Writer1:
posix_memalign(&buffer, 4194304, SIZE);
memset(buffer, 'T', SIZE);
write(fd, buffer, SIZE);

Writer2:
posix_memalign(&buffer, 4194304, SIZE);
memset(buffer, 'X', SIZE);
write(fd, buffer, SIZE);

From the test result, the data in the file may end up as:
TTT...TTT <---> object1
XXX...XXX <---> object2

The expected result should be all "XX.." or "TT.." in both object1
and object2.
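
A fleshed-out version of the two writer fragments could look like the
following sketch; the file path, the SIZE of two full objects (so the write
can't go out as a single OSD request), and the error handling are assumptions
here, not part of the original reproducer:

/* Sketch of one writer: fill a buffer with a single character and write
 * it with O_DIRECT.  SIZE spans two 4 MiB objects, so the write can't be
 * submitted as a single OSD request. */
#define _GNU_SOURCE             /* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define OBJ_SIZE 4194304UL      /* matches the layout above */
#define SIZE     (2 * OBJ_SIZE)

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/mnt/cephfs/testfile";
	char fill = argc > 2 ? argv[2][0] : 'T';  /* 'T' for writer1, 'X' for writer2 */
	void *buffer;
	int fd;

	fd = open(path, O_DIRECT | O_WRONLY | O_CREAT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (posix_memalign(&buffer, OBJ_SIZE, SIZE)) {
		perror("posix_memalign");
		return 1;
	}
	memset(buffer, fill, SIZE);

	if (write(fd, buffer, SIZE) != (ssize_t)SIZE)
		perror("write");

	free(buffer);
	close(fd);
	return 0;
}

Running two instances with different fill characters against the same file
should be enough to reproduce the interleaving described above.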

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/file.c  | 38 +++++++++++++++++++++++++++++++++++---
 fs/ceph/inode.c |  2 ++
 fs/ceph/super.h |  3 +++
 3 files changed, 40 insertions(+), 3 deletions(-)

Comments

Jeff Layton Feb. 5, 2020, 4:24 p.m. UTC | #1
On Mon, 2020-02-03 at 20:54 -0500, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> If the direct IO can't be submitted in a single request, writes from
> multiple writers may overlap each other.
> 
> For example, with the file layout:
> ceph.file.layout="stripe_unit=4194304 stripe_count=1 object_size=4194304"
> 
> fd = open(, O_DIRECT | O_WRONLY, );
> 
> Writer1:
> posix_memalign(&buffer, 4194304, SIZE);
> memset(buffer, 'T', SIZE);
> write(fd, buffer, SIZE);
> 
> Writer2:
> posix_memalign(&buffer, 4194304, SIZE);
> memset(buffer, 'X', SIZE);
> write(fd, buffer, SIZE);
> 
> From the test result, the data in the file may end up as:
> TTT...TTT <---> object1
> XXX...XXX <---> object2
> 
> The expected result should be all "XX.." or "TT.." in both object1
> and object2.
> 

I really don't see this as broken. If you're using O_DIRECT, I don't
believe there is any expectation that the write operations (or even read
operations) will be atomic wrt one another.

Basically, when you do this, you're saying "I know what I'm doing", and
need to provide synchronization yourself between competing applications
and clients (typically via file locking).
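
A sketch of what that could look like for the two writers in the reproducer,
using POSIX advisory locks (the helper name and the range-locking scheme are
illustrative, not something from this thread):

/* Serialize competing O_DIRECT writers with a blocking fcntl() write
 * lock over the byte range being written. */
#include <fcntl.h>
#include <sys/types.h>
#include <unistd.h>

static ssize_t locked_write(int fd, const void *buf, size_t len, off_t off)
{
	struct flock fl = {
		.l_type   = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start  = off,
		.l_len    = len,
	};
	ssize_t ret;

	/* block until this byte range is exclusively ours */
	if (fcntl(fd, F_SETLKW, &fl) < 0)
		return -1;

	ret = pwrite(fd, buf, len, off);

	/* release the range so the other writer can proceed */
	fl.l_type = F_UNLCK;
	fcntl(fd, F_SETLK, &fl);
	return ret;
}

CephFS fcntl locks are mediated by the MDS, so this coordinates writers on
different clients as well as different processes on one client.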


> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/file.c  | 38 +++++++++++++++++++++++++++++++++++---
>  fs/ceph/inode.c |  2 ++
>  fs/ceph/super.h |  3 +++
>  3 files changed, 40 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 1cedba452a66..2741070a58a9 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -961,6 +961,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  	loff_t pos = iocb->ki_pos;
>  	bool write = iov_iter_rw(iter) == WRITE;
>  	bool should_dirty = !write && iter_is_iovec(iter);
> +	bool shared_lock = false;
> +	u64 size;
>  
>  	if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
>  		return -EROFS;
> @@ -977,14 +979,27 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  			dout("invalidate_inode_pages2_range returned %d\n", ret2);
>  
>  		flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
> +
> +		/*
> +		 * If we cannot submit the whole iter in a single request we
> +		 * should block all the following requests to avoid their data
> +		 * overlapping each other.
> +		 *
> +		 * But those which can be submitted in a single request
> +		 * can execute in parallel.
> +		 *
> +		 * Hold the exclusive lock first.
> +		 */
> +		down_write(&ci->i_direct_rwsem);
>  	} else {
>  		flags = CEPH_OSD_FLAG_READ;
>  	}
>  
>  	while (iov_iter_count(iter) > 0) {
> -		u64 size = iov_iter_count(iter);
>  		ssize_t len;
>  
> +		size = iov_iter_count(iter);
> +
>  		if (write)
>  			size = min_t(u64, size, fsc->mount_options->wsize);
>  		else
> @@ -1011,9 +1026,16 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  			ret = len;
>  			break;
>  		}
> +
>  		if (len != size)
>  			osd_req_op_extent_update(req, 0, len);
>  
> +		if (write && pos == iocb->ki_pos && len == count) {
> +			/* Switch to shared lock */
> +			downgrade_write(&ci->i_direct_rwsem);
> +			shared_lock = true;
> +		}
> +
>  		/*
>  		 * To simplify error handling, allow AIO when IO within i_size
>  		 * or IO can be satisfied by single OSD request.
> @@ -1110,7 +1132,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  
>  		if (aio_req->num_reqs == 0) {
>  			kfree(aio_req);
> -			return ret;
> +			goto unlock;
>  		}
>  
>  		ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
> @@ -1131,13 +1153,23 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  				ceph_aio_complete_req(req);
>  			}
>  		}
> -		return -EIOCBQUEUED;
> +		ret = -EIOCBQUEUED;
> +		goto unlock;
>  	}
>  
>  	if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
>  		ret = pos - iocb->ki_pos;
>  		iocb->ki_pos = pos;
>  	}
> +
> +unlock:
> +	if (write) {
> +		if (shared_lock)
> +			up_read(&ci->i_direct_rwsem);
> +		else
> +			up_write(&ci->i_direct_rwsem);
> +	}
> +
>  	return ret;
>  }
>  
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index aee7a24bf1bc..e5d634acd273 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -518,6 +518,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>  
>  	ceph_fscache_inode_init(ci);
>  
> +	init_rwsem(&ci->i_direct_rwsem);
> +
>  	ci->i_meta_err = 0;
>  
>  	return &ci->vfs_inode;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index ee81920bb1a4..213c11bf41be 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -409,6 +409,9 @@ struct ceph_inode_info {
>  	struct fscache_cookie *fscache;
>  	u32 i_fscache_gen;
>  #endif
> +
> +	struct rw_semaphore i_direct_rwsem;
> +
>  	errseq_t i_meta_err;
>  
>  	struct inode vfs_inode; /* at end */
Jeff Layton Feb. 5, 2020, 4:38 p.m. UTC | #2
On Wed, 2020-02-05 at 11:24 -0500, Jeff Layton wrote:
> On Mon, 2020-02-03 at 20:54 -0500, xiubli@redhat.com wrote:
> > From: Xiubo Li <xiubli@redhat.com>
> > 
> > If the direct IO can't be submitted in a single request, writes from
> > multiple writers may overlap each other.
> > 
> > For example, with the file layout:
> > ceph.file.layout="stripe_unit=4194304 stripe_count=1 object_size=4194304"
> > 
> > fd = open(, O_DIRECT | O_WRONLY, );
> > 
> > Writer1:
> > posix_memalign(&buffer, 4194304, SIZE);
> > memset(buffer, 'T', SIZE);
> > write(fd, buffer, SIZE);
> > 
> > Writer2:
> > posix_memalign(&buffer, 4194304, SIZE);
> > memset(buffer, 'X', SIZE);
> > write(fd, buffer, SIZE);
> > 
> > From the test result, the data in the file may end up as:
> > TTT...TTT <---> object1
> > XXX...XXX <---> object2
> > 
> > The expected result should be all "XX.." or "TT.." in both object1
> > and object2.
> > 
> 
> I really don't see this as broken. If you're using O_DIRECT, I don't
> believe there is any expectation that the write operations (or even read
> operations) will be atomic wrt one another.
> 
> Basically, when you do this, you're saying "I know what I'm doing", and
> need to provide synchronization yourself between competing applications
> and clients (typically via file locking).
> 

In fact, here's a discussion about this from a couple of years ago. This
one mostly applies to local filesystems, but the same concepts apply to
CephFS as well:


https://www.spinics.net/lists/linux-fsdevel/msg118115.html
Xiubo Li Feb. 6, 2020, 2:51 a.m. UTC | #3
On 2020/2/6 0:38, Jeff Layton wrote:
> On Wed, 2020-02-05 at 11:24 -0500, Jeff Layton wrote:
>> On Mon, 2020-02-03 at 20:54 -0500, xiubli@redhat.com wrote:
>>> From: Xiubo Li <xiubli@redhat.com>
>>>
>>> If the direct IO can't be submitted in a single request, writes from
>>> multiple writers may overlap each other.
>>>
>>> For example, with the file layout:
>>> ceph.file.layout="stripe_unit=4194304 stripe_count=1 object_size=4194304"
>>>
>>> fd = open(, O_DIRECT | O_WRONLY, );
>>>
>>> Writer1:
>>> posix_memalign(&buffer, 4194304, SIZE);
>>> memset(buffer, 'T', SIZE);
>>> write(fd, buffer, SIZE);
>>>
>>> Writer2:
>>> posix_memalign(&buffer, 4194304, SIZE);
>>> memset(buffer, 'X', SIZE);
>>> write(fd, buffer, SIZE);
>>>
>>> From the test result, the data in the file may end up as:
>>> TTT...TTT <---> object1
>>> XXX...XXX <---> object2
>>>
>>> The expected result should be all "XX.." or "TT.." in both object1
>>> and object2.
>>>
>> I really don't see this as broken. If you're using O_DIRECT, I don't
>> believe there is any expectation that the write operations (or even read
>> operations) will be atomic wrt one another.
>>
>> Basically, when you do this, you're saying "I know what I'm doing", and
>> need to provide synchronization yourself between competing applications
>> and clients (typically via file locking).
>>
> In fact, here's a discussion about this from a couple of years ago. This
> one mostly applies to local filesystems, but the same concepts apply to
> CephFS as well:
>
>
> https://www.spinics.net/lists/linux-fsdevel/msg118115.html
>
Okay. So for O_DIRECT writes/reads, we won't guarantee atomicity at the
fs layer.

Thanks,

Patch

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 1cedba452a66..2741070a58a9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -961,6 +961,8 @@  ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 	loff_t pos = iocb->ki_pos;
 	bool write = iov_iter_rw(iter) == WRITE;
 	bool should_dirty = !write && iter_is_iovec(iter);
+	bool shared_lock = false;
+	u64 size;
 
 	if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
 		return -EROFS;
@@ -977,14 +979,27 @@  ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 			dout("invalidate_inode_pages2_range returned %d\n", ret2);
 
 		flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
+
+		/*
+		 * If we cannot submit the whole iter in a single request we
+		 * should block all the following requests to avoid their data
+		 * overlapping each other.
+		 *
+		 * But those which can be submitted in a single request
+		 * can execute in parallel.
+		 *
+		 * Hold the exclusive lock first.
+		 */
+		down_write(&ci->i_direct_rwsem);
 	} else {
 		flags = CEPH_OSD_FLAG_READ;
 	}
 
 	while (iov_iter_count(iter) > 0) {
-		u64 size = iov_iter_count(iter);
 		ssize_t len;
 
+		size = iov_iter_count(iter);
+
 		if (write)
 			size = min_t(u64, size, fsc->mount_options->wsize);
 		else
@@ -1011,9 +1026,16 @@  ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 			ret = len;
 			break;
 		}
+
 		if (len != size)
 			osd_req_op_extent_update(req, 0, len);
 
+		if (write && pos == iocb->ki_pos && len == count) {
+			/* Switch to shared lock */
+			downgrade_write(&ci->i_direct_rwsem);
+			shared_lock = true;
+		}
+
 		/*
 		 * To simplify error handling, allow AIO when IO within i_size
 		 * or IO can be satisfied by single OSD request.
@@ -1110,7 +1132,7 @@  ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 
 		if (aio_req->num_reqs == 0) {
 			kfree(aio_req);
-			return ret;
+			goto unlock;
 		}
 
 		ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
@@ -1131,13 +1153,23 @@  ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 				ceph_aio_complete_req(req);
 			}
 		}
-		return -EIOCBQUEUED;
+		ret = -EIOCBQUEUED;
+		goto unlock;
 	}
 
 	if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
 		ret = pos - iocb->ki_pos;
 		iocb->ki_pos = pos;
 	}
+
+unlock:
+	if (write) {
+		if (shared_lock)
+			up_read(&ci->i_direct_rwsem);
+		else
+			up_write(&ci->i_direct_rwsem);
+	}
+
 	return ret;
 }
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index aee7a24bf1bc..e5d634acd273 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -518,6 +518,8 @@  struct inode *ceph_alloc_inode(struct super_block *sb)
 
 	ceph_fscache_inode_init(ci);
 
+	init_rwsem(&ci->i_direct_rwsem);
+
 	ci->i_meta_err = 0;
 
 	return &ci->vfs_inode;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ee81920bb1a4..213c11bf41be 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -409,6 +409,9 @@  struct ceph_inode_info {
 	struct fscache_cookie *fscache;
 	u32 i_fscache_gen;
 #endif
+
+	struct rw_semaphore i_direct_rwsem;
+
 	errseq_t i_meta_err;
 
 	struct inode vfs_inode; /* at end */
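
For reference, the locking scheme the patch introduces (take i_direct_rwsem
exclusively, then downgrade to shared once the write is known to fit in a
single request) reduces to the following standalone pattern; this is a
sketch, not the actual ceph code:

/* Standalone sketch of the down_write()/downgrade_write() pattern:
 * multi-request writes hold the semaphore exclusively, single-request
 * writes downgrade and run in parallel with each other. */
#include <linux/rwsem.h>
#include <linux/types.h>

static DECLARE_RWSEM(direct_rwsem);

static void direct_write_sketch(bool fits_in_one_request)
{
	bool shared = false;

	/* assume the worst: the write may need several OSD requests */
	down_write(&direct_rwsem);

	if (fits_in_one_request) {
		/* a single request can't interleave, so let others run too */
		downgrade_write(&direct_rwsem);
		shared = true;
	}

	/* ... build and submit the OSD request(s) here ... */

	if (shared)
		up_read(&direct_rwsem);
	else
		up_write(&direct_rwsem);
}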