diff mbox series

[f2fs-dev,v2] f2fs: fix to avoid racing in between read and OPU dio write

Message ID 20240510023906.281700-1-chao@kernel.org (mailing list archive)
State New
Headers show
Series [f2fs-dev,v2] f2fs: fix to avoid racing in between read and OPU dio write | expand

Commit Message

Chao Yu May 10, 2024, 2:39 a.m. UTC
If lfs mode is on, buffered read may race w/ OPU dio write as below,
it may cause buffered read hits unwritten data unexpectly, and for
dio read, the race condition exists as well.

Thread A			Thread B
- f2fs_file_write_iter
 - f2fs_dio_write_iter
  - __iomap_dio_rw
   - f2fs_iomap_begin
    - f2fs_map_blocks
     - __allocate_data_block
      - allocated blkaddr #x
       - iomap_dio_submit_bio
				- f2fs_file_read_iter
				 - filemap_read
				  - f2fs_read_data_folio
				   - f2fs_mpage_readpages
				    - f2fs_map_blocks
				     : get blkaddr #x
				    - f2fs_submit_read_bio
				IRQ
				- f2fs_read_end_io
				 : read IO on blkaddr #x complete
IRQ
- iomap_dio_bio_end_io
 : direct write IO on blkaddr #x complete

This patch introduces a new per-inode i_opu_rwsem lock to avoid
such race condition.

Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode")
Signed-off-by: Chao Yu <chao@kernel.org>
---
v2:
- fix to cover dio read path w/ i_opu_rwsem as well.
 fs/f2fs/f2fs.h  |  1 +
 fs/f2fs/file.c  | 28 ++++++++++++++++++++++++++--
 fs/f2fs/super.c |  1 +
 3 files changed, 28 insertions(+), 2 deletions(-)

Comments

Jaegeuk Kim May 14, 2024, 4:09 p.m. UTC | #1
On 05/10, Chao Yu wrote:
> If lfs mode is on, buffered read may race w/ OPU dio write as below,
> it may cause buffered read hits unwritten data unexpectly, and for
> dio read, the race condition exists as well.
> 
> Thread A			Thread B
> - f2fs_file_write_iter
>  - f2fs_dio_write_iter
>   - __iomap_dio_rw
>    - f2fs_iomap_begin
>     - f2fs_map_blocks
>      - __allocate_data_block
>       - allocated blkaddr #x
>        - iomap_dio_submit_bio
> 				- f2fs_file_read_iter
> 				 - filemap_read
> 				  - f2fs_read_data_folio
> 				   - f2fs_mpage_readpages
> 				    - f2fs_map_blocks
> 				     : get blkaddr #x
> 				    - f2fs_submit_read_bio
> 				IRQ
> 				- f2fs_read_end_io
> 				 : read IO on blkaddr #x complete
> IRQ
> - iomap_dio_bio_end_io
>  : direct write IO on blkaddr #x complete
> 
> This patch introduces a new per-inode i_opu_rwsem lock to avoid
> such race condition.

Wasn't this supposed to be managed by user-land?

> 
> Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode")
> Signed-off-by: Chao Yu <chao@kernel.org>
> ---
> v2:
> - fix to cover dio read path w/ i_opu_rwsem as well.
>  fs/f2fs/f2fs.h  |  1 +
>  fs/f2fs/file.c  | 28 ++++++++++++++++++++++++++--
>  fs/f2fs/super.c |  1 +
>  3 files changed, 28 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 30058e16a5d0..91cf4b3d6bc6 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -847,6 +847,7 @@ struct f2fs_inode_info {
>  	/* avoid racing between foreground op and gc */
>  	struct f2fs_rwsem i_gc_rwsem[2];
>  	struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
> +	struct f2fs_rwsem i_opu_rwsem;	/* avoid racing between buf read and opu dio write */
>  
>  	int i_extra_isize;		/* size of extra space located in i_addr */
>  	kprojid_t i_projid;		/* id for project quota */
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index 72ce1a522fb2..4ec260af321f 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  	const loff_t pos = iocb->ki_pos;
>  	const size_t count = iov_iter_count(to);
>  	struct iomap_dio *dio;
> +	bool do_opu = f2fs_lfs_mode(sbi);
>  	ssize_t ret;
>  
>  	if (count == 0)
> @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  			ret = -EAGAIN;
>  			goto out;
>  		}
> +		if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) {
> +			f2fs_up_read(&fi->i_gc_rwsem[READ]);
> +			ret = -EAGAIN;
> +			goto out;
> +		}
>  	} else {
>  		f2fs_down_read(&fi->i_gc_rwsem[READ]);
> +		f2fs_down_read(&fi->i_opu_rwsem);
>  	}
>  
>  	/*
> @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  		ret = iomap_dio_complete(dio);
>  	}
>  
> +	f2fs_up_read(&fi->i_opu_rwsem);
>  	f2fs_up_read(&fi->i_gc_rwsem[READ]);
>  
>  	file_accessed(file);
> @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  	if (f2fs_should_use_dio(inode, iocb, to)) {
>  		ret = f2fs_dio_read_iter(iocb, to);
>  	} else {
> +		bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode));
> +
> +		if (do_opu)
> +			f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem);
>  		ret = filemap_read(iocb, to, 0);
> +		if (do_opu)
> +			f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem);
>  		if (ret > 0)
>  			f2fs_update_iostat(F2FS_I_SB(inode), inode,
>  						APP_BUFFERED_READ_IO, ret);
> @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>  			ret = -EAGAIN;
>  			goto out;
>  		}
> +		if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) {
> +			f2fs_up_read(&fi->i_gc_rwsem[READ]);
> +			f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
> +			ret = -EAGAIN;
> +			goto out;
> +		}
>  	} else {
>  		ret = f2fs_convert_inline_inode(inode);
>  		if (ret)
>  			goto out;
>  
>  		f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
> -		if (do_opu)
> +		if (do_opu) {
>  			f2fs_down_read(&fi->i_gc_rwsem[READ]);
> +			f2fs_down_write(&fi->i_opu_rwsem);
> +		}
>  	}
>  
>  	/*
> @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>  		ret = iomap_dio_complete(dio);
>  	}
>  
> -	if (do_opu)
> +	if (do_opu) {
> +		f2fs_up_write(&fi->i_opu_rwsem);
>  		f2fs_up_read(&fi->i_gc_rwsem[READ]);
> +	}
>  	f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
>  
>  	if (ret < 0)
> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> index daf2c4dbe150..b4ed3b094366 100644
> --- a/fs/f2fs/super.c
> +++ b/fs/f2fs/super.c
> @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
>  	init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
>  	init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
>  	init_f2fs_rwsem(&fi->i_xattr_sem);
> +	init_f2fs_rwsem(&fi->i_opu_rwsem);
>  
>  	/* Will be used by directory only */
>  	fi->i_dir_level = F2FS_SB(sb)->dir_level;
> -- 
> 2.40.1
Chao Yu May 15, 2024, 1:42 a.m. UTC | #2
On 2024/5/15 0:09, Jaegeuk Kim wrote:
> On 05/10, Chao Yu wrote:
>> If lfs mode is on, buffered read may race w/ OPU dio write as below,
>> it may cause buffered read hits unwritten data unexpectly, and for
>> dio read, the race condition exists as well.
>>
>> Thread A                      Thread B
>> - f2fs_file_write_iter
>>   - f2fs_dio_write_iter
>>    - __iomap_dio_rw
>>     - f2fs_iomap_begin
>>      - f2fs_map_blocks
>>       - __allocate_data_block
>>        - allocated blkaddr #x
>>         - iomap_dio_submit_bio
>>                                - f2fs_file_read_iter
>>                                 - filemap_read
>>                                  - f2fs_read_data_folio
>>                                   - f2fs_mpage_readpages
>>                                    - f2fs_map_blocks
>>                                     : get blkaddr #x
>>                                    - f2fs_submit_read_bio
>>                                IRQ
>>                                - f2fs_read_end_io
>>                                 : read IO on blkaddr #x complete
>> IRQ
>> - iomap_dio_bio_end_io
>>   : direct write IO on blkaddr #x complete
>>
>> This patch introduces a new per-inode i_opu_rwsem lock to avoid
>> such race condition.
> 
> Wasn't this supposed to be managed by user-land?

Actually, the test case is:

1. mount w/ lfs mode
2. touch file;
3. initialize file w/ 4k zeroed data; fsync;
4. continue triggering dio write 4k zeroed data to file;
5. and meanwhile, continue triggering buf/dio 4k read in file,
use md5sum to verify the 4k data;

It expects data is all zero, however it turned out it's not.

Thanks,

> 
>>
>> Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode")
>> Signed-off-by: Chao Yu <chao@kernel.org>
>> ---
>> v2:
>> - fix to cover dio read path w/ i_opu_rwsem as well.
>>   fs/f2fs/f2fs.h  |  1 +
>>   fs/f2fs/file.c  | 28 ++++++++++++++++++++++++++--
>>   fs/f2fs/super.c |  1 +
>>   3 files changed, 28 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>> index 30058e16a5d0..91cf4b3d6bc6 100644
>> --- a/fs/f2fs/f2fs.h
>> +++ b/fs/f2fs/f2fs.h
>> @@ -847,6 +847,7 @@ struct f2fs_inode_info {
>>        /* avoid racing between foreground op and gc */
>>        struct f2fs_rwsem i_gc_rwsem[2];
>>        struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
>> +     struct f2fs_rwsem i_opu_rwsem;  /* avoid racing between buf read and opu dio write */
>>
>>        int i_extra_isize;              /* size of extra space located in i_addr */
>>        kprojid_t i_projid;             /* id for project quota */
>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>> index 72ce1a522fb2..4ec260af321f 100644
>> --- a/fs/f2fs/file.c
>> +++ b/fs/f2fs/file.c
>> @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>        const loff_t pos = iocb->ki_pos;
>>        const size_t count = iov_iter_count(to);
>>        struct iomap_dio *dio;
>> +     bool do_opu = f2fs_lfs_mode(sbi);
>>        ssize_t ret;
>>
>>        if (count == 0)
>> @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>                        ret = -EAGAIN;
>>                        goto out;
>>                }
>> +             if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) {
>> +                     f2fs_up_read(&fi->i_gc_rwsem[READ]);
>> +                     ret = -EAGAIN;
>> +                     goto out;
>> +             }
>>        } else {
>>                f2fs_down_read(&fi->i_gc_rwsem[READ]);
>> +             f2fs_down_read(&fi->i_opu_rwsem);
>>        }
>>
>>        /*
>> @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>                ret = iomap_dio_complete(dio);
>>        }
>>
>> +     f2fs_up_read(&fi->i_opu_rwsem);
>>        f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>
>>        file_accessed(file);
>> @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>        if (f2fs_should_use_dio(inode, iocb, to)) {
>>                ret = f2fs_dio_read_iter(iocb, to);
>>        } else {
>> +             bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode));
>> +
>> +             if (do_opu)
>> +                     f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem);
>>                ret = filemap_read(iocb, to, 0);
>> +             if (do_opu)
>> +                     f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem);
>>                if (ret > 0)
>>                        f2fs_update_iostat(F2FS_I_SB(inode), inode,
>>                                                APP_BUFFERED_READ_IO, ret);
>> @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>>                        ret = -EAGAIN;
>>                        goto out;
>>                }
>> +             if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) {
>> +                     f2fs_up_read(&fi->i_gc_rwsem[READ]);
>> +                     f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
>> +                     ret = -EAGAIN;
>> +                     goto out;
>> +             }
>>        } else {
>>                ret = f2fs_convert_inline_inode(inode);
>>                if (ret)
>>                        goto out;
>>
>>                f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
>> -             if (do_opu)
>> +             if (do_opu) {
>>                        f2fs_down_read(&fi->i_gc_rwsem[READ]);
>> +                     f2fs_down_write(&fi->i_opu_rwsem);
>> +             }
>>        }
>>
>>        /*
>> @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>>                ret = iomap_dio_complete(dio);
>>        }
>>
>> -     if (do_opu)
>> +     if (do_opu) {
>> +             f2fs_up_write(&fi->i_opu_rwsem);
>>                f2fs_up_read(&fi->i_gc_rwsem[READ]);
>> +     }
>>        f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
>>
>>        if (ret < 0)
>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>> index daf2c4dbe150..b4ed3b094366 100644
>> --- a/fs/f2fs/super.c
>> +++ b/fs/f2fs/super.c
>> @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
>>        init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
>>        init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
>>        init_f2fs_rwsem(&fi->i_xattr_sem);
>> +     init_f2fs_rwsem(&fi->i_opu_rwsem);
>>
>>        /* Will be used by directory only */
>>        fi->i_dir_level = F2FS_SB(sb)->dir_level;
>> --
>> 2.40.1
Jaegeuk Kim May 15, 2024, 4:42 a.m. UTC | #3
On 05/15, Chao Yu wrote:
> On 2024/5/15 0:09, Jaegeuk Kim wrote:
> > On 05/10, Chao Yu wrote:
> > > If lfs mode is on, buffered read may race w/ OPU dio write as below,
> > > it may cause buffered read hits unwritten data unexpectly, and for
> > > dio read, the race condition exists as well.
> > > 
> > > Thread A                      Thread B
> > > - f2fs_file_write_iter
> > >   - f2fs_dio_write_iter
> > >    - __iomap_dio_rw
> > >     - f2fs_iomap_begin
> > >      - f2fs_map_blocks
> > >       - __allocate_data_block
> > >        - allocated blkaddr #x
> > >         - iomap_dio_submit_bio
> > >                                - f2fs_file_read_iter
> > >                                 - filemap_read
> > >                                  - f2fs_read_data_folio
> > >                                   - f2fs_mpage_readpages
> > >                                    - f2fs_map_blocks
> > >                                     : get blkaddr #x
> > >                                    - f2fs_submit_read_bio
> > >                                IRQ
> > >                                - f2fs_read_end_io
> > >                                 : read IO on blkaddr #x complete
> > > IRQ
> > > - iomap_dio_bio_end_io
> > >   : direct write IO on blkaddr #x complete
> > > 
> > > This patch introduces a new per-inode i_opu_rwsem lock to avoid
> > > such race condition.
> > 
> > Wasn't this supposed to be managed by user-land?
> 
> Actually, the test case is:
> 
> 1. mount w/ lfs mode
> 2. touch file;
> 3. initialize file w/ 4k zeroed data; fsync;
> 4. continue triggering dio write 4k zeroed data to file;
> 5. and meanwhile, continue triggering buf/dio 4k read in file,
> use md5sum to verify the 4k data;
> 
> It expects data is all zero, however it turned out it's not.

Can we check outstanding write bios instead of abusing locks?

> 
> Thanks,
> 
> > 
> > > 
> > > Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode")
> > > Signed-off-by: Chao Yu <chao@kernel.org>
> > > ---
> > > v2:
> > > - fix to cover dio read path w/ i_opu_rwsem as well.
> > >   fs/f2fs/f2fs.h  |  1 +
> > >   fs/f2fs/file.c  | 28 ++++++++++++++++++++++++++--
> > >   fs/f2fs/super.c |  1 +
> > >   3 files changed, 28 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > > index 30058e16a5d0..91cf4b3d6bc6 100644
> > > --- a/fs/f2fs/f2fs.h
> > > +++ b/fs/f2fs/f2fs.h
> > > @@ -847,6 +847,7 @@ struct f2fs_inode_info {
> > >        /* avoid racing between foreground op and gc */
> > >        struct f2fs_rwsem i_gc_rwsem[2];
> > >        struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
> > > +     struct f2fs_rwsem i_opu_rwsem;  /* avoid racing between buf read and opu dio write */
> > > 
> > >        int i_extra_isize;              /* size of extra space located in i_addr */
> > >        kprojid_t i_projid;             /* id for project quota */
> > > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > index 72ce1a522fb2..4ec260af321f 100644
> > > --- a/fs/f2fs/file.c
> > > +++ b/fs/f2fs/file.c
> > > @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > >        const loff_t pos = iocb->ki_pos;
> > >        const size_t count = iov_iter_count(to);
> > >        struct iomap_dio *dio;
> > > +     bool do_opu = f2fs_lfs_mode(sbi);
> > >        ssize_t ret;
> > > 
> > >        if (count == 0)
> > > @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > >                        ret = -EAGAIN;
> > >                        goto out;
> > >                }
> > > +             if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) {
> > > +                     f2fs_up_read(&fi->i_gc_rwsem[READ]);
> > > +                     ret = -EAGAIN;
> > > +                     goto out;
> > > +             }
> > >        } else {
> > >                f2fs_down_read(&fi->i_gc_rwsem[READ]);
> > > +             f2fs_down_read(&fi->i_opu_rwsem);
> > >        }
> > > 
> > >        /*
> > > @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > >                ret = iomap_dio_complete(dio);
> > >        }
> > > 
> > > +     f2fs_up_read(&fi->i_opu_rwsem);
> > >        f2fs_up_read(&fi->i_gc_rwsem[READ]);
> > > 
> > >        file_accessed(file);
> > > @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > >        if (f2fs_should_use_dio(inode, iocb, to)) {
> > >                ret = f2fs_dio_read_iter(iocb, to);
> > >        } else {
> > > +             bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode));
> > > +
> > > +             if (do_opu)
> > > +                     f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem);
> > >                ret = filemap_read(iocb, to, 0);
> > > +             if (do_opu)
> > > +                     f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem);
> > >                if (ret > 0)
> > >                        f2fs_update_iostat(F2FS_I_SB(inode), inode,
> > >                                                APP_BUFFERED_READ_IO, ret);
> > > @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
> > >                        ret = -EAGAIN;
> > >                        goto out;
> > >                }
> > > +             if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) {
> > > +                     f2fs_up_read(&fi->i_gc_rwsem[READ]);
> > > +                     f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
> > > +                     ret = -EAGAIN;
> > > +                     goto out;
> > > +             }
> > >        } else {
> > >                ret = f2fs_convert_inline_inode(inode);
> > >                if (ret)
> > >                        goto out;
> > > 
> > >                f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
> > > -             if (do_opu)
> > > +             if (do_opu) {
> > >                        f2fs_down_read(&fi->i_gc_rwsem[READ]);
> > > +                     f2fs_down_write(&fi->i_opu_rwsem);
> > > +             }
> > >        }
> > > 
> > >        /*
> > > @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
> > >                ret = iomap_dio_complete(dio);
> > >        }
> > > 
> > > -     if (do_opu)
> > > +     if (do_opu) {
> > > +             f2fs_up_write(&fi->i_opu_rwsem);
> > >                f2fs_up_read(&fi->i_gc_rwsem[READ]);
> > > +     }
> > >        f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
> > > 
> > >        if (ret < 0)
> > > diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> > > index daf2c4dbe150..b4ed3b094366 100644
> > > --- a/fs/f2fs/super.c
> > > +++ b/fs/f2fs/super.c
> > > @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
> > >        init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
> > >        init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
> > >        init_f2fs_rwsem(&fi->i_xattr_sem);
> > > +     init_f2fs_rwsem(&fi->i_opu_rwsem);
> > > 
> > >        /* Will be used by directory only */
> > >        fi->i_dir_level = F2FS_SB(sb)->dir_level;
> > > --
> > > 2.40.1
Chao Yu May 15, 2024, 6:38 a.m. UTC | #4
On 2024/5/15 12:42, Jaegeuk Kim wrote:
> On 05/15, Chao Yu wrote:
>> On 2024/5/15 0:09, Jaegeuk Kim wrote:
>>> On 05/10, Chao Yu wrote:
>>>> If lfs mode is on, buffered read may race w/ OPU dio write as below,
>>>> it may cause buffered read hits unwritten data unexpectly, and for
>>>> dio read, the race condition exists as well.
>>>>
>>>> Thread A                      Thread B
>>>> - f2fs_file_write_iter
>>>>    - f2fs_dio_write_iter
>>>>     - __iomap_dio_rw
>>>>      - f2fs_iomap_begin
>>>>       - f2fs_map_blocks
>>>>        - __allocate_data_block
>>>>         - allocated blkaddr #x
>>>>          - iomap_dio_submit_bio
>>>>                                 - f2fs_file_read_iter
>>>>                                  - filemap_read
>>>>                                   - f2fs_read_data_folio
>>>>                                    - f2fs_mpage_readpages
>>>>                                     - f2fs_map_blocks
>>>>                                      : get blkaddr #x
>>>>                                     - f2fs_submit_read_bio
>>>>                                 IRQ
>>>>                                 - f2fs_read_end_io
>>>>                                  : read IO on blkaddr #x complete
>>>> IRQ
>>>> - iomap_dio_bio_end_io
>>>>    : direct write IO on blkaddr #x complete
>>>>
>>>> This patch introduces a new per-inode i_opu_rwsem lock to avoid
>>>> such race condition.
>>>
>>> Wasn't this supposed to be managed by user-land?
>>
>> Actually, the test case is:
>>
>> 1. mount w/ lfs mode
>> 2. touch file;
>> 3. initialize file w/ 4k zeroed data; fsync;
>> 4. continue triggering dio write 4k zeroed data to file;
>> 5. and meanwhile, continue triggering buf/dio 4k read in file,
>> use md5sum to verify the 4k data;
>>
>> It expects data is all zero, however it turned out it's not.
> 
> Can we check outstanding write bios instead of abusing locks?

I didn't figure out a way to solve this w/o lock, due to:
- write bios can be issued after outstanding write bios check condition,
result in the race.
- once read() detects that there are outstanding write bios, we need to
delay read flow rather than fail it, right? It looks using a lock is more
proper here?

Any suggestion?

Thanks,

> 
>>
>> Thanks,
>>
>>>
>>>>
>>>> Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode")
>>>> Signed-off-by: Chao Yu <chao@kernel.org>
>>>> ---
>>>> v2:
>>>> - fix to cover dio read path w/ i_opu_rwsem as well.
>>>>    fs/f2fs/f2fs.h  |  1 +
>>>>    fs/f2fs/file.c  | 28 ++++++++++++++++++++++++++--
>>>>    fs/f2fs/super.c |  1 +
>>>>    3 files changed, 28 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>>>> index 30058e16a5d0..91cf4b3d6bc6 100644
>>>> --- a/fs/f2fs/f2fs.h
>>>> +++ b/fs/f2fs/f2fs.h
>>>> @@ -847,6 +847,7 @@ struct f2fs_inode_info {
>>>>         /* avoid racing between foreground op and gc */
>>>>         struct f2fs_rwsem i_gc_rwsem[2];
>>>>         struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
>>>> +     struct f2fs_rwsem i_opu_rwsem;  /* avoid racing between buf read and opu dio write */
>>>>
>>>>         int i_extra_isize;              /* size of extra space located in i_addr */
>>>>         kprojid_t i_projid;             /* id for project quota */
>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>>>> index 72ce1a522fb2..4ec260af321f 100644
>>>> --- a/fs/f2fs/file.c
>>>> +++ b/fs/f2fs/file.c
>>>> @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>>         const loff_t pos = iocb->ki_pos;
>>>>         const size_t count = iov_iter_count(to);
>>>>         struct iomap_dio *dio;
>>>> +     bool do_opu = f2fs_lfs_mode(sbi);
>>>>         ssize_t ret;
>>>>
>>>>         if (count == 0)
>>>> @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>>                         ret = -EAGAIN;
>>>>                         goto out;
>>>>                 }
>>>> +             if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) {
>>>> +                     f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>> +                     ret = -EAGAIN;
>>>> +                     goto out;
>>>> +             }
>>>>         } else {
>>>>                 f2fs_down_read(&fi->i_gc_rwsem[READ]);
>>>> +             f2fs_down_read(&fi->i_opu_rwsem);
>>>>         }
>>>>
>>>>         /*
>>>> @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>>                 ret = iomap_dio_complete(dio);
>>>>         }
>>>>
>>>> +     f2fs_up_read(&fi->i_opu_rwsem);
>>>>         f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>>
>>>>         file_accessed(file);
>>>> @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>>         if (f2fs_should_use_dio(inode, iocb, to)) {
>>>>                 ret = f2fs_dio_read_iter(iocb, to);
>>>>         } else {
>>>> +             bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode));
>>>> +
>>>> +             if (do_opu)
>>>> +                     f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem);
>>>>                 ret = filemap_read(iocb, to, 0);
>>>> +             if (do_opu)
>>>> +                     f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem);
>>>>                 if (ret > 0)
>>>>                         f2fs_update_iostat(F2FS_I_SB(inode), inode,
>>>>                                                 APP_BUFFERED_READ_IO, ret);
>>>> @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>>>>                         ret = -EAGAIN;
>>>>                         goto out;
>>>>                 }
>>>> +             if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) {
>>>> +                     f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>> +                     f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
>>>> +                     ret = -EAGAIN;
>>>> +                     goto out;
>>>> +             }
>>>>         } else {
>>>>                 ret = f2fs_convert_inline_inode(inode);
>>>>                 if (ret)
>>>>                         goto out;
>>>>
>>>>                 f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
>>>> -             if (do_opu)
>>>> +             if (do_opu) {
>>>>                         f2fs_down_read(&fi->i_gc_rwsem[READ]);
>>>> +                     f2fs_down_write(&fi->i_opu_rwsem);
>>>> +             }
>>>>         }
>>>>
>>>>         /*
>>>> @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>>>>                 ret = iomap_dio_complete(dio);
>>>>         }
>>>>
>>>> -     if (do_opu)
>>>> +     if (do_opu) {
>>>> +             f2fs_up_write(&fi->i_opu_rwsem);
>>>>                 f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>> +     }
>>>>         f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
>>>>
>>>>         if (ret < 0)
>>>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>>>> index daf2c4dbe150..b4ed3b094366 100644
>>>> --- a/fs/f2fs/super.c
>>>> +++ b/fs/f2fs/super.c
>>>> @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
>>>>         init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
>>>>         init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
>>>>         init_f2fs_rwsem(&fi->i_xattr_sem);
>>>> +     init_f2fs_rwsem(&fi->i_opu_rwsem);
>>>>
>>>>         /* Will be used by directory only */
>>>>         fi->i_dir_level = F2FS_SB(sb)->dir_level;
>>>> --
>>>> 2.40.1
Wu Bo May 15, 2024, 8:32 a.m. UTC | #5
On Fri, May 10, 2024 at 10:39:06AM +0800, Chao Yu wrote:
> If lfs mode is on, buffered read may race w/ OPU dio write as below,
> it may cause buffered read hits unwritten data unexpectly, and for
> dio read, the race condition exists as well.
> 
> Thread A			Thread B
> - f2fs_file_write_iter
>  - f2fs_dio_write_iter
>   - __iomap_dio_rw
>    - f2fs_iomap_begin
>     - f2fs_map_blocks
>      - __allocate_data_block
>       - allocated blkaddr #x
>        - iomap_dio_submit_bio
> 				- f2fs_file_read_iter
> 				 - filemap_read
> 				  - f2fs_read_data_folio
> 				   - f2fs_mpage_readpages
> 				    - f2fs_map_blocks
> 				     : get blkaddr #x
> 				    - f2fs_submit_read_bio
> 				IRQ
> 				- f2fs_read_end_io
> 				 : read IO on blkaddr #x complete
> IRQ
> - iomap_dio_bio_end_io
>  : direct write IO on blkaddr #x complete
> 
Looks like every COW filesystem would meet this situation. What's the solution
of other FS?
> This patch introduces a new per-inode i_opu_rwsem lock to avoid
> such race condition.
>
Markus Elfring May 15, 2024, 8:40 a.m. UTC | #6
> This patch introduces a new …

Please choose a corresponding imperative wording for an improved change description.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v6.9#n94

Regards,
Markus
kernel test robot May 17, 2024, 8:15 a.m. UTC | #7
Hello,

kernel test robot noticed "WARNING:at_kernel/locking/rwsem.c:#down_read" on:

commit: abf7df61e5c60fed520a09102d576fd41ed4d5ee ("[PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write")
url: https://github.com/intel-lab-lkp/linux/commits/Chao-Yu/f2fs-fix-to-avoid-racing-in-between-read-and-OPU-dio-write/20240510-104020
base: https://git.kernel.org/cgit/linux/kernel/git/jaegeuk/f2fs.git dev
patch link: https://lore.kernel.org/all/20240510023906.281700-1-chao@kernel.org/
patch subject: [PATCH v2] f2fs: fix to avoid racing in between read and OPU dio write

in testcase: xfstests
version: xfstests-x86_64-0e5c12df-1_20240511
with following parameters:

	disk: 4HDD
	fs: f2fs
	test: generic-617



compiler: gcc-13
test machine: 4 threads Intel(R) Core(TM) i5-6500 CPU @ 3.20GHz (Skylake) with 32G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202405171532.760e22d3-oliver.sang@intel.com


[  160.985543][ T2255] ------------[ cut here ]------------
[ 160.990864][ T2255] WARNING: CPU: 3 PID: 2255 at kernel/locking/rwsem.c:245 down_read (kernel/locking/rwsem.c:245 (discriminator 1) kernel/locking/rwsem.c:1249 (discriminator 1) kernel/locking/rwsem.c:1263 (discriminator 1) kernel/locking/rwsem.c:1528 (discriminator 1)) 
[  160.999715][ T2255] Modules linked in: f2fs crc32_generic intel_rapl_msr intel_rapl_common x86_pkg_temp_thermal btrfs intel_powerclamp blake2b_generic coretemp xor zstd_compress kvm_intel raid6_pq libcrc32c i915 kvm sd_mod t10_pi crc64_rocksoft_generic crct10dif_pclmul crc32_pclmul crc64_rocksoft crc64 crc32c_intel ghash_clmulni_intel sha512_ssse3 sg drm_buddy rapl ipmi_devintf intel_cstate intel_gtt ipmi_msghandler mei_wdt wmi_bmof intel_uncore i2c_i801 drm_display_helper i2c_smbus ahci ttm drm_kms_helper libahci video libata mei_me mei acpi_pad intel_pch_thermal wmi binfmt_misc loop fuse drm dm_mod ip_tables
[  161.053654][ T2255] CPU: 3 PID: 2255 Comm: fsx Tainted: G          I        6.9.0-rc1-00036-gabf7df61e5c6 #1
[  161.063438][ T2255] Hardware name: Dell Inc. OptiPlex 7040/0Y7WYT, BIOS 1.1.1 10/07/2015
[ 161.071504][ T2255] RIP: 0010:down_read (kernel/locking/rwsem.c:245 (discriminator 1) kernel/locking/rwsem.c:1249 (discriminator 1) kernel/locking/rwsem.c:1263 (discriminator 1) kernel/locking/rwsem.c:1528 (discriminator 1)) 
[ 161.076376][ T2255] Code: b8 00 00 00 00 00 fc ff df 83 e3 02 48 c1 ea 03 4c 09 fb 48 83 cb 01 80 3c 02 00 0f 85 9a 00 00 00 48 89 5d 08 e9 50 ff ff ff <0f> 0b 4c 8d 7d 08 be 08 00 00 00 4c 89 ff e8 c1 59 e9 fd 4c 89 f8
All code
========
   0:	b8 00 00 00 00       	mov    $0x0,%eax
   5:	00 fc                	add    %bh,%ah
   7:	ff                   	(bad)  
   8:	df 83 e3 02 48 c1    	filds  -0x3eb7fd1d(%rbx)
   e:	ea                   	(bad)  
   f:	03 4c 09 fb          	add    -0x5(%rcx,%rcx,1),%ecx
  13:	48 83 cb 01          	or     $0x1,%rbx
  17:	80 3c 02 00          	cmpb   $0x0,(%rdx,%rax,1)
  1b:	0f 85 9a 00 00 00    	jne    0xbb
  21:	48 89 5d 08          	mov    %rbx,0x8(%rbp)
  25:	e9 50 ff ff ff       	jmpq   0xffffffffffffff7a
  2a:*	0f 0b                	ud2    		<-- trapping instruction
  2c:	4c 8d 7d 08          	lea    0x8(%rbp),%r15
  30:	be 08 00 00 00       	mov    $0x8,%esi
  35:	4c 89 ff             	mov    %r15,%rdi
  38:	e8 c1 59 e9 fd       	callq  0xfffffffffde959fe
  3d:	4c 89 f8             	mov    %r15,%rax

Code starting with the faulting instruction
===========================================
   0:	0f 0b                	ud2    
   2:	4c 8d 7d 08          	lea    0x8(%rbp),%r15
   6:	be 08 00 00 00       	mov    $0x8,%esi
   b:	4c 89 ff             	mov    %r15,%rdi
   e:	e8 c1 59 e9 fd       	callq  0xfffffffffde959d4
  13:	4c 89 f8             	mov    %r15,%rax
[  161.095753][ T2255] RSP: 0018:ffffc90002c0f8b8 EFLAGS: 00010286
[  161.101662][ T2255] RAX: fffffffffffffe00 RBX: fffffffffffffe00 RCX: ffffffff83bdf543
[  161.109469][ T2255] RDX: ffffed10236632f0 RSI: 0000000000000008 RDI: ffff88811b319778
[  161.117276][ T2255] RBP: ffff88811b319778 R08: 0000000000000001 R09: ffffed10236632ef
[  161.125080][ T2255] R10: ffff88811b31977f R11: ffffffff85fe96a2 R12: 1ffff92000581f18
[  161.132885][ T2255] R13: dffffc0000000000 R14: ffffc90002c0fa70 R15: 0000000000000000
[  161.140691][ T2255] FS:  00007f623f19d740(0000) GS:ffff8887e9380000(0000) knlGS:0000000000000000
[  161.149446][ T2255] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  161.155871][ T2255] CR2: 00007f623f05b000 CR3: 00000001b2026006 CR4: 00000000003706f0
[  161.163705][ T2255] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  161.171512][ T2255] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  161.179316][ T2255] Call Trace:
[  161.182456][ T2255]  <TASK>
[ 161.185254][ T2255] ? __warn (kernel/panic.c:694) 
[ 161.189181][ T2255] ? down_read (kernel/locking/rwsem.c:245 (discriminator 1) kernel/locking/rwsem.c:1249 (discriminator 1) kernel/locking/rwsem.c:1263 (discriminator 1) kernel/locking/rwsem.c:1528 (discriminator 1)) 
[ 161.193456][ T2255] ? report_bug (lib/bug.c:180 lib/bug.c:219) 
[ 161.197814][ T2255] ? handle_bug (arch/x86/kernel/traps.c:239 (discriminator 1)) 
[ 161.202003][ T2255] ? exc_invalid_op (arch/x86/kernel/traps.c:260 (discriminator 1)) 
[ 161.206532][ T2255] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:621) 
[ 161.211414][ T2255] ? down_read (arch/x86/include/asm/atomic64_64.h:79 (discriminator 5) include/linux/atomic/atomic-arch-fallback.h:2723 (discriminator 5) include/linux/atomic/atomic-long.h:163 (discriminator 5) include/linux/atomic/atomic-instrumented.h:3298 (discriminator 5) kernel/locking/rwsem.c:243 (discriminator 5) kernel/locking/rwsem.c:1249 (discriminator 5) kernel/locking/rwsem.c:1263 (discriminator 5) kernel/locking/rwsem.c:1528 (discriminator 5)) 
[ 161.215600][ T2255] ? down_read (kernel/locking/rwsem.c:245 (discriminator 1) kernel/locking/rwsem.c:1249 (discriminator 1) kernel/locking/rwsem.c:1263 (discriminator 1) kernel/locking/rwsem.c:1528 (discriminator 1)) 
[ 161.219854][ T2255] ? down_read (arch/x86/include/asm/atomic64_64.h:79 (discriminator 5) include/linux/atomic/atomic-arch-fallback.h:2723 (discriminator 5) include/linux/atomic/atomic-long.h:163 (discriminator 5) include/linux/atomic/atomic-instrumented.h:3298 (discriminator 5) kernel/locking/rwsem.c:243 (discriminator 5) kernel/locking/rwsem.c:1249 (discriminator 5) kernel/locking/rwsem.c:1263 (discriminator 5) kernel/locking/rwsem.c:1528 (discriminator 5)) 
[ 161.224034][ T2255] ? __rmqueue_pcplist (include/linux/list.h:215 (discriminator 1) include/linux/list.h:229 (discriminator 1) mm/page_alloc.c:2836 (discriminator 1)) 
[ 161.228904][ T2255] ? __pfx_down_read (kernel/locking/rwsem.c:1524) 
[ 161.233566][ T2255] ? __pfx___might_resched (kernel/sched/core.c:10152) 
[ 161.238707][ T2255] f2fs_dio_read_iter (fs/f2fs/f2fs.h:2468 (discriminator 1) fs/f2fs/file.c:4477 (discriminator 1)) f2fs
[ 161.244288][ T2255] f2fs_file_read_iter (fs/f2fs/file.c:4533) f2fs
[ 161.249949][ T2255] copy_splice_read (include/linux/fs.h:2102 fs/splice.c:365) 
[ 161.254636][ T2255] ? __pfx_copy_splice_read (fs/splice.c:324) 
[ 161.259843][ T2255] splice_direct_to_actor (fs/splice.c:1136) 
[ 161.265045][ T2255] ? __pfx_direct_splice_actor (fs/splice.c:1159) 
[ 161.270507][ T2255] ? __pfx_splice_direct_to_actor (fs/splice.c:1032) 
[ 161.276229][ T2255] ? __pfx___might_resched (kernel/sched/core.c:10152) 
[ 161.281359][ T2255] ? from_kgid_munged (kernel/user_namespace.c:527) 
[ 161.286131][ T2255] do_splice_direct (fs/splice.c:1208 fs/splice.c:1233) 
[ 161.290829][ T2255] ? __pfx_do_splice_direct (fs/splice.c:1232) 
[ 161.296047][ T2255] ? __pfx_direct_file_splice_eof (fs/splice.c:1178) 
[ 161.301779][ T2255] ? __pfx___might_resched (kernel/sched/core.c:10152) 
[ 161.306894][ T2255] ? rw_verify_area (fs/read_write.c:377) 
[ 161.311507][ T2255] vfs_copy_file_range (fs/read_write.c:1558) 
[ 161.316552][ T2255] ? f2fs_getattr (arch/x86/include/asm/bitops.h:206 (discriminator 1) arch/x86/include/asm/bitops.h:238 (discriminator 1) include/asm-generic/bitops/instrumented-non-atomic.h:142 (discriminator 1) fs/f2fs/f2fs.h:3056 (discriminator 1) fs/f2fs/f2fs.h:3311 (discriminator 1) fs/f2fs/file.c:904 (discriminator 1)) f2fs
[ 161.321759][ T2255] ? __pfx_vfs_copy_file_range (fs/read_write.c:1486) 
[ 161.327234][ T2255] ? __pfx___might_resched (kernel/sched/core.c:10152) 
[ 161.332362][ T2255] __do_sys_copy_file_range (fs/read_write.c:1612) 
[ 161.337755][ T2255] ? __pfx___do_sys_copy_file_range (fs/read_write.c:1578) 
[ 161.343661][ T2255] ? f2fs_llseek (arch/x86/include/asm/bitops.h:206 (discriminator 1) arch/x86/include/asm/bitops.h:238 (discriminator 1) include/asm-generic/bitops/instrumented-non-atomic.h:142 (discriminator 1) fs/f2fs/f2fs.h:3056 (discriminator 1) fs/f2fs/f2fs.h:3210 (discriminator 1) fs/f2fs/file.c:518 (discriminator 1)) f2fs
[ 161.348813][ T2255] do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) 
[ 161.353151][ T2255] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) 
[  161.358886][ T2255] RIP: 0033:0x7f623f2a1719
[ 161.363137][ T2255] Code: 08 89 e8 5b 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b7 06 0d 00 f7 d8 64 89 01 48
All code
========
   0:	08 89 e8 5b 5d c3    	or     %cl,-0x3ca2a418(%rcx)
   6:	66 2e 0f 1f 84 00 00 	nopw   %cs:0x0(%rax,%rax,1)
   d:	00 00 00 
  10:	90                   	nop
  11:	48 89 f8             	mov    %rdi,%rax
  14:	48 89 f7             	mov    %rsi,%rdi
  17:	48 89 d6             	mov    %rdx,%rsi
  1a:	48 89 ca             	mov    %rcx,%rdx
  1d:	4d 89 c2             	mov    %r8,%r10
  20:	4d 89 c8             	mov    %r9,%r8
  23:	4c 8b 4c 24 08       	mov    0x8(%rsp),%r9
  28:	0f 05                	syscall 
  2a:*	48 3d 01 f0 ff ff    	cmp    $0xfffffffffffff001,%rax		<-- trapping instruction
  30:	73 01                	jae    0x33
  32:	c3                   	retq   
  33:	48 8b 0d b7 06 0d 00 	mov    0xd06b7(%rip),%rcx        # 0xd06f1
  3a:	f7 d8                	neg    %eax
  3c:	64 89 01             	mov    %eax,%fs:(%rcx)
  3f:	48                   	rex.W

Code starting with the faulting instruction
===========================================
   0:	48 3d 01 f0 ff ff    	cmp    $0xfffffffffffff001,%rax
   6:	73 01                	jae    0x9
   8:	c3                   	retq   
   9:	48 8b 0d b7 06 0d 00 	mov    0xd06b7(%rip),%rcx        # 0xd06c7
  10:	f7 d8                	neg    %eax
  12:	64 89 01             	mov    %eax,%fs:(%rcx)
  15:	48                   	rex.W


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240517/202405171532.760e22d3-oliver.sang@intel.com
Chao Yu June 6, 2024, 10:25 a.m. UTC | #8
On 2024/5/15 16:32, Wu Bo wrote:
> On Fri, May 10, 2024 at 10:39:06AM +0800, Chao Yu wrote:
>> If lfs mode is on, buffered read may race w/ OPU dio write as below,
>> it may cause buffered read hits unwritten data unexpectly, and for
>> dio read, the race condition exists as well.
>>
>> Thread A                      Thread B
>> - f2fs_file_write_iter
>>   - f2fs_dio_write_iter
>>    - __iomap_dio_rw
>>     - f2fs_iomap_begin
>>      - f2fs_map_blocks
>>       - __allocate_data_block
>>        - allocated blkaddr #x
>>         - iomap_dio_submit_bio
>>                                - f2fs_file_read_iter
>>                                 - filemap_read
>>                                  - f2fs_read_data_folio
>>                                   - f2fs_mpage_readpages
>>                                    - f2fs_map_blocks
>>                                     : get blkaddr #x
>>                                    - f2fs_submit_read_bio
>>                                IRQ
>>                                - f2fs_read_end_io
>>                                 : read IO on blkaddr #x complete
>> IRQ
>> - iomap_dio_bio_end_io
>>   : direct write IO on blkaddr #x complete
>>
> Looks like every COW filesystem would meet this situation. What's the solution
> of other FS?

I missed to reply this...

Other cow filesystem like btrfs, it will update metadata after data IO completion,
so it is safe.

Thanks,

>> This patch introduces a new per-inode i_opu_rwsem lock to avoid
>> such race condition.
>>
Chao Yu June 6, 2024, 10:31 a.m. UTC | #9
On 2024/5/15 14:38, Chao Yu wrote:
> On 2024/5/15 12:42, Jaegeuk Kim wrote:
>> On 05/15, Chao Yu wrote:
>>> On 2024/5/15 0:09, Jaegeuk Kim wrote:
>>>> On 05/10, Chao Yu wrote:
>>>>> If lfs mode is on, buffered read may race w/ OPU dio write as below,
>>>>> it may cause buffered read hits unwritten data unexpectly, and for
>>>>> dio read, the race condition exists as well.
>>>>>
>>>>> Thread A                      Thread B
>>>>> - f2fs_file_write_iter
>>>>>    - f2fs_dio_write_iter
>>>>>     - __iomap_dio_rw
>>>>>      - f2fs_iomap_begin
>>>>>       - f2fs_map_blocks
>>>>>        - __allocate_data_block
>>>>>         - allocated blkaddr #x
>>>>>          - iomap_dio_submit_bio
>>>>>                                 - f2fs_file_read_iter
>>>>>                                  - filemap_read
>>>>>                                   - f2fs_read_data_folio
>>>>>                                    - f2fs_mpage_readpages
>>>>>                                     - f2fs_map_blocks
>>>>>                                      : get blkaddr #x
>>>>>                                     - f2fs_submit_read_bio
>>>>>                                 IRQ
>>>>>                                 - f2fs_read_end_io
>>>>>                                  : read IO on blkaddr #x complete
>>>>> IRQ
>>>>> - iomap_dio_bio_end_io
>>>>>    : direct write IO on blkaddr #x complete
>>>>>
>>>>> This patch introduces a new per-inode i_opu_rwsem lock to avoid
>>>>> such race condition.
>>>>
>>>> Wasn't this supposed to be managed by user-land?
>>>
>>> Actually, the test case is:
>>>
>>> 1. mount w/ lfs mode
>>> 2. touch file;
>>> 3. initialize file w/ 4k zeroed data; fsync;
>>> 4. continue triggering dio write 4k zeroed data to file;
>>> 5. and meanwhile, continue triggering buf/dio 4k read in file,
>>> use md5sum to verify the 4k data;
>>>
>>> It expects data is all zero, however it turned out it's not.
>>
>> Can we check outstanding write bios instead of abusing locks?

Jaegeuk, seems it can solve partial race cases, not all of them.

Do you suggest to use this compromised solution?

Thanks,

> 
> I didn't figure out a way to solve this w/o lock, due to:
> - write bios can be issued after outstanding write bios check condition,
> result in the race.
> - once read() detects that there are outstanding write bios, we need to
> delay read flow rather than fail it, right? It looks using a lock is more
> proper here?
> 
> Any suggestion?
> 
> Thanks,
> 
>>
>>>
>>> Thanks,
>>>
>>>>
>>>>>
>>>>> Fixes: f847c699cff3 ("f2fs: allow out-place-update for direct IO in LFS mode")
>>>>> Signed-off-by: Chao Yu <chao@kernel.org>
>>>>> ---
>>>>> v2:
>>>>> - fix to cover dio read path w/ i_opu_rwsem as well.
>>>>>    fs/f2fs/f2fs.h  |  1 +
>>>>>    fs/f2fs/file.c  | 28 ++++++++++++++++++++++++++--
>>>>>    fs/f2fs/super.c |  1 +
>>>>>    3 files changed, 28 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>>>>> index 30058e16a5d0..91cf4b3d6bc6 100644
>>>>> --- a/fs/f2fs/f2fs.h
>>>>> +++ b/fs/f2fs/f2fs.h
>>>>> @@ -847,6 +847,7 @@ struct f2fs_inode_info {
>>>>>         /* avoid racing between foreground op and gc */
>>>>>         struct f2fs_rwsem i_gc_rwsem[2];
>>>>>         struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
>>>>> +     struct f2fs_rwsem i_opu_rwsem;  /* avoid racing between buf read and opu dio write */
>>>>>
>>>>>         int i_extra_isize;              /* size of extra space located in i_addr */
>>>>>         kprojid_t i_projid;             /* id for project quota */
>>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>>>>> index 72ce1a522fb2..4ec260af321f 100644
>>>>> --- a/fs/f2fs/file.c
>>>>> +++ b/fs/f2fs/file.c
>>>>> @@ -4445,6 +4445,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>>>         const loff_t pos = iocb->ki_pos;
>>>>>         const size_t count = iov_iter_count(to);
>>>>>         struct iomap_dio *dio;
>>>>> +     bool do_opu = f2fs_lfs_mode(sbi);
>>>>>         ssize_t ret;
>>>>>
>>>>>         if (count == 0)
>>>>> @@ -4457,8 +4458,14 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>>>                         ret = -EAGAIN;
>>>>>                         goto out;
>>>>>                 }
>>>>> +             if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) {
>>>>> +                     f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>>> +                     ret = -EAGAIN;
>>>>> +                     goto out;
>>>>> +             }
>>>>>         } else {
>>>>>                 f2fs_down_read(&fi->i_gc_rwsem[READ]);
>>>>> +             f2fs_down_read(&fi->i_opu_rwsem);
>>>>>         }
>>>>>
>>>>>         /*
>>>>> @@ -4477,6 +4484,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>>>                 ret = iomap_dio_complete(dio);
>>>>>         }
>>>>>
>>>>> +     f2fs_up_read(&fi->i_opu_rwsem);
>>>>>         f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>>>
>>>>>         file_accessed(file);
>>>>> @@ -4523,7 +4531,13 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>>>         if (f2fs_should_use_dio(inode, iocb, to)) {
>>>>>                 ret = f2fs_dio_read_iter(iocb, to);
>>>>>         } else {
>>>>> +             bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode));
>>>>> +
>>>>> +             if (do_opu)
>>>>> +                     f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem);
>>>>>                 ret = filemap_read(iocb, to, 0);
>>>>> +             if (do_opu)
>>>>> +                     f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem);
>>>>>                 if (ret > 0)
>>>>>                         f2fs_update_iostat(F2FS_I_SB(inode), inode,
>>>>>                                                 APP_BUFFERED_READ_IO, ret);
>>>>> @@ -4748,14 +4762,22 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>>>>>                         ret = -EAGAIN;
>>>>>                         goto out;
>>>>>                 }
>>>>> +             if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) {
>>>>> +                     f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>>> +                     f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
>>>>> +                     ret = -EAGAIN;
>>>>> +                     goto out;
>>>>> +             }
>>>>>         } else {
>>>>>                 ret = f2fs_convert_inline_inode(inode);
>>>>>                 if (ret)
>>>>>                         goto out;
>>>>>
>>>>>                 f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
>>>>> -             if (do_opu)
>>>>> +             if (do_opu) {
>>>>>                         f2fs_down_read(&fi->i_gc_rwsem[READ]);
>>>>> +                     f2fs_down_write(&fi->i_opu_rwsem);
>>>>> +             }
>>>>>         }
>>>>>
>>>>>         /*
>>>>> @@ -4779,8 +4801,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
>>>>>                 ret = iomap_dio_complete(dio);
>>>>>         }
>>>>>
>>>>> -     if (do_opu)
>>>>> +     if (do_opu) {
>>>>> +             f2fs_up_write(&fi->i_opu_rwsem);
>>>>>                 f2fs_up_read(&fi->i_gc_rwsem[READ]);
>>>>> +     }
>>>>>         f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
>>>>>
>>>>>         if (ret < 0)
>>>>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>>>>> index daf2c4dbe150..b4ed3b094366 100644
>>>>> --- a/fs/f2fs/super.c
>>>>> +++ b/fs/f2fs/super.c
>>>>> @@ -1428,6 +1428,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
>>>>>         init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
>>>>>         init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
>>>>>         init_f2fs_rwsem(&fi->i_xattr_sem);
>>>>> +     init_f2fs_rwsem(&fi->i_opu_rwsem);
>>>>>
>>>>>         /* Will be used by directory only */
>>>>>         fi->i_dir_level = F2FS_SB(sb)->dir_level;
>>>>> -- 
>>>>> 2.40.1
> 
> 
> _______________________________________________
> Linux-f2fs-devel mailing list
> Linux-f2fs-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
diff mbox series

Patch

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 30058e16a5d0..91cf4b3d6bc6 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -847,6 +847,7 @@  struct f2fs_inode_info {
 	/* avoid racing between foreground op and gc */
 	struct f2fs_rwsem i_gc_rwsem[2];
 	struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
+	struct f2fs_rwsem i_opu_rwsem;	/* avoid racing between buf read and opu dio write */
 
 	int i_extra_isize;		/* size of extra space located in i_addr */
 	kprojid_t i_projid;		/* id for project quota */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 72ce1a522fb2..4ec260af321f 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -4445,6 +4445,7 @@  static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	const loff_t pos = iocb->ki_pos;
 	const size_t count = iov_iter_count(to);
 	struct iomap_dio *dio;
+	bool do_opu = f2fs_lfs_mode(sbi);
 	ssize_t ret;
 
 	if (count == 0)
@@ -4457,8 +4458,14 @@  static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			ret = -EAGAIN;
 			goto out;
 		}
+		if (do_opu && !f2fs_down_read_trylock(&fi->i_opu_rwsem)) {
+			f2fs_up_read(&fi->i_gc_rwsem[READ]);
+			ret = -EAGAIN;
+			goto out;
+		}
 	} else {
 		f2fs_down_read(&fi->i_gc_rwsem[READ]);
+		f2fs_down_read(&fi->i_opu_rwsem);
 	}
 
 	/*
@@ -4477,6 +4484,7 @@  static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		ret = iomap_dio_complete(dio);
 	}
 
+	f2fs_up_read(&fi->i_opu_rwsem);
 	f2fs_up_read(&fi->i_gc_rwsem[READ]);
 
 	file_accessed(file);
@@ -4523,7 +4531,13 @@  static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	if (f2fs_should_use_dio(inode, iocb, to)) {
 		ret = f2fs_dio_read_iter(iocb, to);
 	} else {
+		bool do_opu = f2fs_lfs_mode(F2FS_I_SB(inode));
+
+		if (do_opu)
+			f2fs_down_read(&F2FS_I(inode)->i_opu_rwsem);
 		ret = filemap_read(iocb, to, 0);
+		if (do_opu)
+			f2fs_up_read(&F2FS_I(inode)->i_opu_rwsem);
 		if (ret > 0)
 			f2fs_update_iostat(F2FS_I_SB(inode), inode,
 						APP_BUFFERED_READ_IO, ret);
@@ -4748,14 +4762,22 @@  static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
 			ret = -EAGAIN;
 			goto out;
 		}
+		if (do_opu && !f2fs_down_write_trylock(&fi->i_opu_rwsem)) {
+			f2fs_up_read(&fi->i_gc_rwsem[READ]);
+			f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
+			ret = -EAGAIN;
+			goto out;
+		}
 	} else {
 		ret = f2fs_convert_inline_inode(inode);
 		if (ret)
 			goto out;
 
 		f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
-		if (do_opu)
+		if (do_opu) {
 			f2fs_down_read(&fi->i_gc_rwsem[READ]);
+			f2fs_down_write(&fi->i_opu_rwsem);
+		}
 	}
 
 	/*
@@ -4779,8 +4801,10 @@  static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
 		ret = iomap_dio_complete(dio);
 	}
 
-	if (do_opu)
+	if (do_opu) {
+		f2fs_up_write(&fi->i_opu_rwsem);
 		f2fs_up_read(&fi->i_gc_rwsem[READ]);
+	}
 	f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
 
 	if (ret < 0)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index daf2c4dbe150..b4ed3b094366 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1428,6 +1428,7 @@  static struct inode *f2fs_alloc_inode(struct super_block *sb)
 	init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
 	init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
 	init_f2fs_rwsem(&fi->i_xattr_sem);
+	init_f2fs_rwsem(&fi->i_opu_rwsem);
 
 	/* Will be used by directory only */
 	fi->i_dir_level = F2FS_SB(sb)->dir_level;