diff mbox

[v2] pnfs/blocklayout: update last_write_offset atomically with extents

Message ID d589c4985ebf916dcaf70a1c5e3bea809d2df696.1471873308.git.bcodding@redhat.com (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Benjamin Coddington Aug. 22, 2016, 1:48 p.m. UTC
Change from v1:

Instead of moving the i_lock protected region around, store last written
byte for block layouts on struct pnfs_block_layout and use that when
encoding LAYOUTCOMMIT.

8<------------------------------------------------------------------------

Block/SCSI layout write completion may add committable extents to the
extent tree before updating the layout's last-written byte under the inode
lock.  If a sync happens before this value is updated, then
prepare_layoutcommit may find and encode these extents which would produce
a LAYOUTCOMMIT request whose encoded extents are larger than the request's
loca_length.

Fix this by using a last-written byte value that is updated atomically with
the extent tree.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
---
 fs/nfs/blocklayout/blocklayout.c | 2 +-
 fs/nfs/blocklayout/blocklayout.h | 3 ++-
 fs/nfs/blocklayout/extent_tree.c | 8 +++++---
 3 files changed, 8 insertions(+), 5 deletions(-)

Comments

Trond Myklebust Aug. 22, 2016, 3:34 p.m. UTC | #1
> On Aug 22, 2016, at 09:48, Benjamin Coddington <bcodding@redhat.com> wrote:
> 
> Change from v1:
> 
> Instead of moving the i_lock protected region around, store last written
> byte for block layouts on struct pnfs_block_layout and use that when
> encoding LAYOUTCOMMIT.
> 
> 8<------------------------------------------------------------------------
> 
> Block/SCSI layout write completion may add committable extents to the
> extent tree before updating the layout's last-written byte under the inode
> lock.  If a sync happens before this value is updated, then
> prepare_layoutcommit may find and encode these extents which would produce
> a LAYOUTCOMMIT request whose encoded extents are larger than the request's
> loca_length.
> 
> Fix this by using a last-written byte value that is updated atomically with
> the extent tree.
> 
> Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
> ---
> fs/nfs/blocklayout/blocklayout.c | 2 +-
> fs/nfs/blocklayout/blocklayout.h | 3 ++-
> fs/nfs/blocklayout/extent_tree.c | 8 +++++---
> 3 files changed, 8 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> index 17a42e4eb872..25cdd559831c 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -344,7 +344,7 @@ static void bl_write_cleanup(struct work_struct *work)
> 			PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
> 
> 		ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
> -					(end - start) >> SECTOR_SHIFT);
> +					(end - start) >> SECTOR_SHIFT, end);
> 	}
> 
> 	pnfs_ld_write_done(hdr);
> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> index 18e6fd0b9506..efc007f00742 100644
> --- a/fs/nfs/blocklayout/blocklayout.h
> +++ b/fs/nfs/blocklayout/blocklayout.h
> @@ -141,6 +141,7 @@ struct pnfs_block_layout {
> 	struct rb_root		bl_ext_ro;
> 	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
> 	bool			bl_scsi_layout;
> +	u64			bl_lwb;
> };
> 
> static inline struct pnfs_block_layout *
> @@ -182,7 +183,7 @@ int ext_tree_insert(struct pnfs_block_layout *bl,
> int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
> 		sector_t end);
> int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
> -		sector_t len);
> +		sector_t len, u64 lwb);
> bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
> 		struct pnfs_block_extent *ret, bool rw);
> int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
> diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
> index 992bcb19c11e..8d08dfe1e057 100644
> --- a/fs/nfs/blocklayout/extent_tree.c
> +++ b/fs/nfs/blocklayout/extent_tree.c
> @@ -402,7 +402,7 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
> 
> int
> ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
> -		sector_t len)
> +		sector_t len, u64 lwb)
> {
> 	struct rb_root *root = &bl->bl_ext_rw;
> 	sector_t end = start + len;
> @@ -471,6 +471,7 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
> 		}
> 	}
> out:
> +	bl->bl_lwb = lwb;
> 	spin_unlock(&bl->bl_ext_lock);
> 
> 	__ext_put_deviceids(&tmp);
> @@ -518,7 +519,7 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
> }
> 
> static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
> -		size_t buffer_size, size_t *count)
> +		size_t buffer_size, size_t *count, __u64 *lastbyte)
> {
> 	struct pnfs_block_extent *be;
> 	int ret = 0;
> @@ -541,6 +542,7 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
> 		else
> 			p = encode_block_extent(be, p);
> 		be->be_tag = EXTENT_COMMITTING;
> +		*lastbyte = bl->bl_lwb - 1;
> 	}
> 	spin_unlock(&bl->bl_ext_lock);
> 
> @@ -564,7 +566,7 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
> 	arg->layoutupdate_pages = &arg->layoutupdate_page;
> 
> retry:
> -	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
> +	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
> 	if (unlikely(ret)) {
> 		ext_tree_free_commitdata(arg, buffer_size);
> 

That looks good. Thanks!

Trond

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Coddington Aug. 22, 2016, 4:22 p.m. UTC | #2
Arg, wait, this isn't right; it doesn't handle the case where a mid-file
write follows a write that would extend the file.  It needs to check if
bl_lwb < lwb before setting the value, but it also needs a way to clear 
it
for every cycle of NFS_INO_LAYOUTCOMMIT.

I'll try again.

Ben

On 22 Aug 2016, at 11:34, Trond Myklebust wrote:

>> On Aug 22, 2016, at 09:48, Benjamin Coddington <bcodding@redhat.com> 
>> wrote:
>>
>> Change from v1:
>>
>> Instead of moving the i_lock protected region around, store last 
>> written
>> byte for block layouts on struct pnfs_block_layout and use that when
>> encoding LAYOUTCOMMIT.
>>
>> 8<------------------------------------------------------------------------
>>
>> Block/SCSI layout write completion may add committable extents to the
>> extent tree before updating the layout's last-written byte under the 
>> inode
>> lock.  If a sync happens before this value is updated, then
>> prepare_layoutcommit may find and encode these extents which would 
>> produce
>> a LAYOUTCOMMIT request whose encoded extents are larger than the 
>> request's
>> loca_length.
>>
>> Fix this by using a last-written byte value that is updated 
>> atomically with
>> the extent tree.
>>
>> Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
>> ---
>> fs/nfs/blocklayout/blocklayout.c | 2 +-
>> fs/nfs/blocklayout/blocklayout.h | 3 ++-
>> fs/nfs/blocklayout/extent_tree.c | 8 +++++---
>> 3 files changed, 8 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/nfs/blocklayout/blocklayout.c 
>> b/fs/nfs/blocklayout/blocklayout.c
>> index 17a42e4eb872..25cdd559831c 100644
>> --- a/fs/nfs/blocklayout/blocklayout.c
>> +++ b/fs/nfs/blocklayout/blocklayout.c
>> @@ -344,7 +344,7 @@ static void bl_write_cleanup(struct work_struct 
>> *work)
>> 			PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
>>
>> 		ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
>> -					(end - start) >> SECTOR_SHIFT);
>> +					(end - start) >> SECTOR_SHIFT, end);
>> 	}
>>
>> 	pnfs_ld_write_done(hdr);
>> diff --git a/fs/nfs/blocklayout/blocklayout.h 
>> b/fs/nfs/blocklayout/blocklayout.h
>> index 18e6fd0b9506..efc007f00742 100644
>> --- a/fs/nfs/blocklayout/blocklayout.h
>> +++ b/fs/nfs/blocklayout/blocklayout.h
>> @@ -141,6 +141,7 @@ struct pnfs_block_layout {
>> 	struct rb_root		bl_ext_ro;
>> 	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
>> 	bool			bl_scsi_layout;
>> +	u64			bl_lwb;
>> };
>>
>> static inline struct pnfs_block_layout *
>> @@ -182,7 +183,7 @@ int ext_tree_insert(struct pnfs_block_layout *bl,
>> int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t 
>> start,
>> 		sector_t end);
>> int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t 
>> start,
>> -		sector_t len);
>> +		sector_t len, u64 lwb);
>> bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
>> 		struct pnfs_block_extent *ret, bool rw);
>> int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
>> diff --git a/fs/nfs/blocklayout/extent_tree.c 
>> b/fs/nfs/blocklayout/extent_tree.c
>> index 992bcb19c11e..8d08dfe1e057 100644
>> --- a/fs/nfs/blocklayout/extent_tree.c
>> +++ b/fs/nfs/blocklayout/extent_tree.c
>> @@ -402,7 +402,7 @@ ext_tree_split(struct rb_root *root, struct 
>> pnfs_block_extent *be,
>>
>> int
>> ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
>> -		sector_t len)
>> +		sector_t len, u64 lwb)
>> {
>> 	struct rb_root *root = &bl->bl_ext_rw;
>> 	sector_t end = start + len;
>> @@ -471,6 +471,7 @@ ext_tree_mark_written(struct pnfs_block_layout 
>> *bl, sector_t start,
>> 		}
>> 	}
>> out:
>> +	bl->bl_lwb = lwb;
>> 	spin_unlock(&bl->bl_ext_lock);
>>
>> 	__ext_put_deviceids(&tmp);
>> @@ -518,7 +519,7 @@ static __be32 *encode_scsi_range(struct 
>> pnfs_block_extent *be, __be32 *p)
>> }
>>
>> static int ext_tree_encode_commit(struct pnfs_block_layout *bl, 
>> __be32 *p,
>> -		size_t buffer_size, size_t *count)
>> +		size_t buffer_size, size_t *count, __u64 *lastbyte)
>> {
>> 	struct pnfs_block_extent *be;
>> 	int ret = 0;
>> @@ -541,6 +542,7 @@ static int ext_tree_encode_commit(struct 
>> pnfs_block_layout *bl, __be32 *p,
>> 		else
>> 			p = encode_block_extent(be, p);
>> 		be->be_tag = EXTENT_COMMITTING;
>> +		*lastbyte = bl->bl_lwb - 1;
>> 	}
>> 	spin_unlock(&bl->bl_ext_lock);
>>
>> @@ -564,7 +566,7 @@ ext_tree_prepare_commit(struct 
>> nfs4_layoutcommit_args *arg)
>> 	arg->layoutupdate_pages = &arg->layoutupdate_page;
>>
>> retry:
>> -	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
>> +	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, 
>> &arg->lastbytewritten);
>> 	if (unlikely(ret)) {
>> 		ext_tree_free_commitdata(arg, buffer_size);
>>
>
> That looks good. Thanks!
>
> Trond
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 17a42e4eb872..25cdd559831c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -344,7 +344,7 @@  static void bl_write_cleanup(struct work_struct *work)
 			PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
 
 		ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
-					(end - start) >> SECTOR_SHIFT);
+					(end - start) >> SECTOR_SHIFT, end);
 	}
 
 	pnfs_ld_write_done(hdr);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 18e6fd0b9506..efc007f00742 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -141,6 +141,7 @@  struct pnfs_block_layout {
 	struct rb_root		bl_ext_ro;
 	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
 	bool			bl_scsi_layout;
+	u64			bl_lwb;
 };
 
 static inline struct pnfs_block_layout *
@@ -182,7 +183,7 @@  int ext_tree_insert(struct pnfs_block_layout *bl,
 int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
 		sector_t end);
 int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
-		sector_t len);
+		sector_t len, u64 lwb);
 bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
 		struct pnfs_block_extent *ret, bool rw);
 int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 992bcb19c11e..8d08dfe1e057 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -402,7 +402,7 @@  ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
 
 int
 ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
-		sector_t len)
+		sector_t len, u64 lwb)
 {
 	struct rb_root *root = &bl->bl_ext_rw;
 	sector_t end = start + len;
@@ -471,6 +471,7 @@  ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
 		}
 	}
 out:
+	bl->bl_lwb = lwb;
 	spin_unlock(&bl->bl_ext_lock);
 
 	__ext_put_deviceids(&tmp);
@@ -518,7 +519,7 @@  static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
 }
 
 static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
-		size_t buffer_size, size_t *count)
+		size_t buffer_size, size_t *count, __u64 *lastbyte)
 {
 	struct pnfs_block_extent *be;
 	int ret = 0;
@@ -541,6 +542,7 @@  static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
 		else
 			p = encode_block_extent(be, p);
 		be->be_tag = EXTENT_COMMITTING;
+		*lastbyte = bl->bl_lwb - 1;
 	}
 	spin_unlock(&bl->bl_ext_lock);
 
@@ -564,7 +566,7 @@  ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
 	arg->layoutupdate_pages = &arg->layoutupdate_page;
 
 retry:
-	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
+	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
 	if (unlikely(ret)) {
 		ext_tree_free_commitdata(arg, buffer_size);