diff mbox series

[v3,2/2] 9p: use i_lock to protect update of inode->i_blocks under 32-bit

Message ID 20190124063514.8571-3-houtao1@huawei.com (mailing list archive)
State New, archived
Headers show
Series use i_lock to protect updates of i_blocks & | expand

Commit Message

Hou Tao Jan. 24, 2019, 6:35 a.m. UTC
When v9fs_stat2inode() is invoked concurrently under 32-bit case
and CONFIG_LBDAF is enabled, multiple updates of inode->i_blocks
may corrupt the value of i_blocks because the assignment of 64-bit
value under 32-bit host is not atomic.

Fix it by using i_lock to protect update of inode->i_blocks. Also
Skip the update when it's not requested.

Suggested-by: Dominique Martinet <asmadeus@codewreck.org>
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 fs/9p/v9fs_vfs.h       | 10 ++++++++++
 fs/9p/vfs_inode.c      |  7 ++++---
 fs/9p/vfs_inode_dotl.c | 16 +++++++++-------
 3 files changed, 23 insertions(+), 10 deletions(-)

Comments

Dominique Martinet Jan. 24, 2019, 7:25 a.m. UTC | #1
Hou Tao wrote on Thu, Jan 24, 2019:
> When v9fs_stat2inode() is invoked concurrently under 32-bit case
> and CONFIG_LBDAF is enabled, multiple updates of inode->i_blocks
> may corrupt the value of i_blocks because the assignment of 64-bit
> value under 32-bit host is not atomic.
> 
> Fix it by using i_lock to protect update of inode->i_blocks. Also
> Skip the update when it's not requested.
> 
> Suggested-by: Dominique Martinet <asmadeus@codewreck.org>
> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ---
>  fs/9p/v9fs_vfs.h       | 10 ++++++++++
>  fs/9p/vfs_inode.c      |  7 ++++---
>  fs/9p/vfs_inode_dotl.c | 16 +++++++++-------
>  3 files changed, 23 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
> index 3d371b9e461a..0bd4acfdb6af 100644
> --- a/fs/9p/v9fs_vfs.h
> +++ b/fs/9p/v9fs_vfs.h
> @@ -101,4 +101,14 @@ static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size)
>  	if (sizeof(i_size) > sizeof(long))
>  		spin_unlock(&inode->i_lock);
>  }
> +
> +static inline void v9fs_i_blocks_write(struct inode *inode, blkcnt_t i_blocks)
> +{
> +	/* Avoid taking i_lock under 64-bits */
> +	if (sizeof(i_blocks) > sizeof(long))
> +		spin_lock(&inode->i_lock);
> +	inode->i_blocks = i_blocks;
> +	if (sizeof(i_blocks) > sizeof(long))
> +		spin_unlock(&inode->i_lock);
> +}
>  #endif
> diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
> index 72b779bc0942..f05ff3cfbfe7 100644
> --- a/fs/9p/vfs_inode.c
> +++ b/fs/9p/vfs_inode.c
> @@ -1218,10 +1218,11 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
>  	mode |= inode->i_mode & ~S_IALLUGO;
>  	inode->i_mode = mode;
>  
> -	if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
> +	if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) {
>  		v9fs_i_size_write(inode, stat->length);
> -	/* not real number of blocks, but 512 byte ones ... */
> -	inode->i_blocks = (stat->length + 512 - 1) >> 9;
> +		/* not real number of blocks, but 512 byte ones ... */
> +		v9fs_i_blocks_write(inode, (stat->length + 512 - 1) >> 9);

hmm, part of me wants to abuse v9fs_i_size_write to also update i_blocks
as I think we'd usually want to keep both in sync.

In v9fs_file_write_iter we do not update i_blocks directly but
inode_add_bytes basically does the same thing... Speaking of which it's
assuming inode->i_bytes is set correctly initially; I _think_ we're
supposed to call inode_set_bytes() instead of writing i_blocks directly
in stat2inode?
Sorry, I'm not too familiar with this code and there doesn't seem to be
many users of inode_set_bytes so it's a bit puzzling to me.

I'm not using cache mode much but I have some very weird behaviour
currently if I mount something with cache=fscache :
 - if I create a new file from 9p and stat on the same client, a size of
 0-511 has block = 0 then it's just always off by 1
 - if I create a file outside of 9p and stat it, for example starting
 with 400 bytes, it starts with 8 blocks then the number of blocks
 increase by 1 everytime 512 bytes are written regardless of initial
 size.

Let's try to go back to some decent behaviour there first, and then
we can decide if it still make sense to separate v9fs_i_blocks_write and
v9fs_i_size_write or not.
Hou Tao Jan. 24, 2019, 2:03 p.m. UTC | #2
Hi,

On 2019/1/24 15:25, Dominique Martinet wrote:
> Hou Tao wrote on Thu, Jan 24, 2019:
>> When v9fs_stat2inode() is invoked concurrently under 32-bit case
>> and CONFIG_LBDAF is enabled, multiple updates of inode->i_blocks
>> may corrupt the value of i_blocks because the assignment of 64-bit
>> value under 32-bit host is not atomic.
>>
>> Fix it by using i_lock to protect update of inode->i_blocks. Also
>> Skip the update when it's not requested.
>>
>> Suggested-by: Dominique Martinet <asmadeus@codewreck.org>
>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>> ---
>>  fs/9p/v9fs_vfs.h       | 10 ++++++++++
>>  fs/9p/vfs_inode.c      |  7 ++++---
>>  fs/9p/vfs_inode_dotl.c | 16 +++++++++-------
>>  3 files changed, 23 insertions(+), 10 deletions(-)
>>
>> diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
>> index 3d371b9e461a..0bd4acfdb6af 100644
>> --- a/fs/9p/v9fs_vfs.h
>> +++ b/fs/9p/v9fs_vfs.h
>> @@ -101,4 +101,14 @@ static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size)
>>  	if (sizeof(i_size) > sizeof(long))
>>  		spin_unlock(&inode->i_lock);
>>  }
>> +
>> +static inline void v9fs_i_blocks_write(struct inode *inode, blkcnt_t i_blocks)
>> +{
>> +	/* Avoid taking i_lock under 64-bits */
>> +	if (sizeof(i_blocks) > sizeof(long))
>> +		spin_lock(&inode->i_lock);
>> +	inode->i_blocks = i_blocks;
>> +	if (sizeof(i_blocks) > sizeof(long))
>> +		spin_unlock(&inode->i_lock);
>> +}
>>  #endif
>> diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
>> index 72b779bc0942..f05ff3cfbfe7 100644
>> --- a/fs/9p/vfs_inode.c
>> +++ b/fs/9p/vfs_inode.c
>> @@ -1218,10 +1218,11 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
>>  	mode |= inode->i_mode & ~S_IALLUGO;
>>  	inode->i_mode = mode;
>>  
>> -	if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
>> +	if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) {
>>  		v9fs_i_size_write(inode, stat->length);
>> -	/* not real number of blocks, but 512 byte ones ... */
>> -	inode->i_blocks = (stat->length + 512 - 1) >> 9;
>> +		/* not real number of blocks, but 512 byte ones ... */
>> +		v9fs_i_blocks_write(inode, (stat->length + 512 - 1) >> 9);
> 
> hmm, part of me wants to abuse v9fs_i_size_write to also update i_blocks
> as I think we'd usually want to keep both in sync.We can do that for v9fs_stat2inode(), but for v9fs_stat2inode_dotl()
the updates of i_block & i_size are controlled by different flags, so
separated updates are needed (maybe also add an extra flags ?).

> 
> In v9fs_file_write_iter we do not update i_blocks directly but
> inode_add_bytes basically does the same thing... Speaking of which it's
> assuming inode->i_bytes is set correctly initially; I _think_ we're
> supposed to call inode_set_bytes() instead of writing i_blocks directly
> in stat2inode?
> Sorry, I'm not too familiar with this code and there doesn't seem to be
> many users of inode_set_bytes so it's a bit puzzling to me.
> 
In my understanding, now only disk quota uses i_bytes, so for 9p there is not so
much different between updating i_blocks directly and using inode_set_bytes(),
but maybe using inode_set_bytes() will be appropriate.


> I'm not using cache mode much but I have some very weird behaviour
> currently if I mount something with cache=fscache :
>  - if I create a new file from 9p and stat on the same client, a size of
>  0-511 has block = 0 then it's just always off by 1
Is this not the expected result ? v9fs_write_end() will call inode_add_bytes(inode,511)
if we write 511 bytes to the new file and i_blocks will be 0.

>  - if I create a file outside of 9p and stat it, for example starting
>  with 400 bytes, it starts with 8 blocks then the number of blocks
>  increase by 1 everytime 512 bytes are written regardless of initial
>  size.
> 
The initial i_blocks comes from the underlying filesystem, so I think 8 blocks
just mean the block size of the underlying filesystem is 4KB. If another 512 bytes
are written into the file, inode_add_bytes() will increase i_blocks by one instead of
fetching i_blocks from server, though i_blocks in server side will still be 8. Maybe
we should call v9fs_invalidate_inode_attr() in v9fs_write_end() ?

> Let's try to go back to some decent behaviour there first, and then
> we can decide if it still make sense to separate v9fs_i_blocks_write and
> v9fs_i_size_write or not.
> 
I don't follow that. Could you elaborate ?

Regards,
Tao
Dominique Martinet Jan. 24, 2019, 2:45 p.m. UTC | #3
Hou Tao wrote on Thu, Jan 24, 2019:
> > hmm, part of me wants to abuse v9fs_i_size_write to also update i_blocks
> > as I think we'd usually want to keep both in sync.We can do that for
> > v9fs_stat2inode(), but for v9fs_stat2inode_dotl()
>
> the updates of i_block & i_size are controlled by different flags, so
> separated updates are needed (maybe also add an extra flags ?).

Is there any case where i_block and i_size are not updated together?
I would be happy with an extra flag if required, I just do not see where
right now (admitedly didn't look for long, will have a second look
tomorrow)


> > In v9fs_file_write_iter we do not update i_blocks directly but
> > inode_add_bytes basically does the same thing... Speaking of which it's
> > assuming inode->i_bytes is set correctly initially; I _think_ we're
> > supposed to call inode_set_bytes() instead of writing i_blocks directly
> > in stat2inode?
> > Sorry, I'm not too familiar with this code and there doesn't seem to be
> > many users of inode_set_bytes so it's a bit puzzling to me.
>
> In my understanding, now only disk quota uses i_bytes, so for 9p there is not so
> much different between updating i_blocks directly and using inode_set_bytes(),
> but maybe using inode_set_bytes() will be appropriate.

I also think only quota uses i_bytes directly, but 9p also uses it
indirectly through inode_add_bytes() - i_blocks is incremented by 1
every 512 bytes and i_bytes is reduced to its last few bits (basically
%512)


> > I'm not using cache mode much but I have some very weird behaviour
> > currently if I mount something with cache=fscache :
> >  - if I create a new file from 9p and stat on the same client, a size of
> >  0-511 has block = 0 then it's just always off by 1

> Is this not the expected result ? v9fs_write_end() will call
> inode_add_bytes(inode,511) if we write 511 bytes to the new file and
> i_blocks will be 0.

That is the obvious result from the current code, yes - but it
definitely is wrong.
i_blocks should be 1 from byte 1 onwards until 512, and bumped to 2 at
513. From a filesystem point of view, a block is already "consumed" from
the first byte onward.
(This isn't what inode_set_bytes does..)


In practice, this actually is fairly important - tools like tar are
optimized such that if a file has no blocks it is considered sparse and
isn't read at all, so this could lead to data loss.
Interestingly I cannot reproduce with a recent gnu tar but I am sure I
have seen the behaviour (some lustre bug[1] talks about this as well and
the code is still present[2]); we should not report a st_blocks of 0 is
there is data -- v9fs_stat2inode (not v9fs_stat2inode_dotl) also rounds
up i_blocks so this is the right behaviour, we're just not doing it when
creating new files apparently.

[1] https://jira.whamcloud.com/browse/LU-3864
[2] http://git.savannah.gnu.org/cgit/tar.git/tree/src/sparse.c#n273


If this optimization didn't exist and given i_bytes existence it does
seem more correct to do like inode_set_bytes (e.g. blocks = bytes / 512
and bytes = bytes % 512) but this really strikes me as odd; this might
be why hardly anyone uses this...


> >  - if I create a file outside of 9p and stat it, for example starting
> >  with 400 bytes, it starts with 8 blocks then the number of blocks
> >  increase by 1 everytime 512 bytes are written regardless of initial
> >  size.
>
> The initial i_blocks comes from the underlying filesystem, so I think 8 blocks
> just mean the block size of the underlying filesystem is 4KB. If another 512 bytes
> are written into the file, inode_add_bytes() will increase i_blocks by one instead of
> fetching i_blocks from server, though i_blocks in server side will
> still be 8.

Ah, I had only looked at v9fs_stat2inode() which creates the value from
i_size, not v9fs_stat2inode_dotl().
v9fs_stat2inode_dotl() does take the value from the RPC, and then 8 is
indeed correct -- it will just be very painful to keep synchronized with
the underlying fs; we have no ways of knowing when the remote filesystem
decides to allocate new blocks.
If for example the write iter only writes 0 and the underlying fs is
optimized st_blocks might not increase at all.

> Maybe we should call v9fs_invalidate_inode_attr() in v9fs_write_end() ?

Hmm, That will probably add quite a lot of RPCs and make the cache
option rather useless...
Honestly as far as I'm concerned the value of i_blocks is mostly
informative (it's used by tools like du to give a size estimate, and
that's about it?) - with the exception of the 0 value I was talking
earlier.

I'd advocate to never display the remote st_blocks but do like
v9fs_stat2inode() and always just set blocks appropriately from the
size (i.e. i_blocks = (i_size + 512 - 1) >> 9 and i_bytes = 0), but I'd
like some opinions first.
I'll send a mail to fsdevel asking vfs folks for advices tomorrow; let's
put this patch on hold for a few days.


Thanks,
Dominique Martinet Jan. 24, 2019, 2:53 p.m. UTC | #4
Dominique Martinet wrote on Thu, Jan 24, 2019:
> > >  - if I create a file outside of 9p and stat it, for example starting
> > >  with 400 bytes, it starts with 8 blocks then the number of blocks
> > >  increase by 1 everytime 512 bytes are written regardless of initial
> > >  size.
> >
> > The initial i_blocks comes from the underlying filesystem, so I think 8 blocks
> > just mean the block size of the underlying filesystem is 4KB. If another 512 bytes
> > are written into the file, inode_add_bytes() will increase i_blocks by one instead of
> > fetching i_blocks from server, though i_blocks in server side will
> > still be 8.
> 
> Ah, I had only looked at v9fs_stat2inode() which creates the value from
> i_size, not v9fs_stat2inode_dotl().
> v9fs_stat2inode_dotl() does take the value from the RPC, and then 8 is
> indeed correct -- it will just be very painful to keep synchronized with
> the underlying fs; we have no ways of knowing when the remote filesystem
> decides to allocate new blocks.
> If for example the write iter only writes 0 and the underlying fs is
> optimized st_blocks might not increase at all.
> 
> > Maybe we should call v9fs_invalidate_inode_attr() in v9fs_write_end() ?
> 
> Hmm, That will probably add quite a lot of RPCs and make the cache
> option rather useless...

Actually I made an interesting point without realizing it myself -
thinking back of the option, I would keep the current behaviour with
cache=none/mmap (as the inode attr will be invalid and refreshed on
every stat anyway - this incidentally probably is why I never noticed
it, cache=none works just fine as far as st_blocks is concerned because
it always returns the underlying storage's block count) and just 100%
fake it for cache=fscache/loose.

Still going to bring this whole subject up in a separate thread to
fsdevel tomorrow, I'll put you and the v9fs-developer list in cc.


Will also add a note to run more testings with various cache
options...
diff mbox series

Patch

diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 3d371b9e461a..0bd4acfdb6af 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -101,4 +101,14 @@  static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size)
 	if (sizeof(i_size) > sizeof(long))
 		spin_unlock(&inode->i_lock);
 }
+
+static inline void v9fs_i_blocks_write(struct inode *inode, blkcnt_t i_blocks)
+{
+	/* Avoid taking i_lock under 64-bits */
+	if (sizeof(i_blocks) > sizeof(long))
+		spin_lock(&inode->i_lock);
+	inode->i_blocks = i_blocks;
+	if (sizeof(i_blocks) > sizeof(long))
+		spin_unlock(&inode->i_lock);
+}
 #endif
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 72b779bc0942..f05ff3cfbfe7 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1218,10 +1218,11 @@  v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 	mode |= inode->i_mode & ~S_IALLUGO;
 	inode->i_mode = mode;
 
-	if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+	if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) {
 		v9fs_i_size_write(inode, stat->length);
-	/* not real number of blocks, but 512 byte ones ... */
-	inode->i_blocks = (stat->length + 512 - 1) >> 9;
+		/* not real number of blocks, but 512 byte ones ... */
+		v9fs_i_blocks_write(inode, (stat->length + 512 - 1) >> 9);
+	}
 	v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
 }
 
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index a950a927a626..c1e2416d83c1 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -633,9 +633,10 @@  v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
 		mode |= inode->i_mode & ~S_IALLUGO;
 		inode->i_mode = mode;
 
-		if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+		if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) {
 			v9fs_i_size_write(inode, stat->st_size);
-		inode->i_blocks = stat->st_blocks;
+			v9fs_i_blocks_write(inode, stat->st_blocks);
+		}
 	} else {
 		if (stat->st_result_mask & P9_STATS_ATIME) {
 			inode->i_atime.tv_sec = stat->st_atime_sec;
@@ -664,11 +665,12 @@  v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
 		}
 		if (stat->st_result_mask & P9_STATS_RDEV)
 			inode->i_rdev = new_decode_dev(stat->st_rdev);
-		if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
-		    stat->st_result_mask & P9_STATS_SIZE)
-			v9fs_i_size_write(inode, stat->st_size);
-		if (stat->st_result_mask & P9_STATS_BLOCKS)
-			inode->i_blocks = stat->st_blocks;
+		if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) {
+			if (stat->st_result_mask & P9_STATS_SIZE)
+				v9fs_i_size_write(inode, stat->st_size);
+			if (stat->st_result_mask & P9_STATS_BLOCKS)
+				v9fs_i_blocks_write(inode, stat->st_blocks);
+		}
 	}
 	if (stat->st_result_mask & P9_STATS_GEN)
 		inode->i_generation = stat->st_gen;