diff mbox series

block: hold ->invalidate_lock in blkdev_fallocate

Message ID 20210915123545.1000534-1-ming.lei@redhat.com (mailing list archive)
State New, archived
Headers show
Series block: hold ->invalidate_lock in blkdev_fallocate | expand

Commit Message

Ming Lei Sept. 15, 2021, 12:35 p.m. UTC
When running ->fallocate(), blkdev_fallocate() should hold
mapping->invalidate_lock to prevent page cache from being
accessed, otherwise stale data may be read in page cache.

Without this patch, blktests block/009 fails sometimes. With
this patch, block/009 can pass always.

Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 fs/block_dev.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

Comments

kernel test robot Sept. 15, 2021, 2:41 p.m. UTC | #1
Hi Ming,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on v5.14-rc7]
[cannot apply to v5.15-rc1 next-20210915]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Ming-Lei/block-hold-invalidate_lock-in-blkdev_fallocate/20210915-203759
base:    e22ce8eb631bdc47a4a4ea7ecf4e4ba499db4f93
config: nds32-defconfig (attached as .config)
compiler: nds32le-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/802c451a54927267d87a707d2a5fe3e47bc4ca1c
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Ming-Lei/block-hold-invalidate_lock-in-blkdev_fallocate/20210915-203759
        git checkout 802c451a54927267d87a707d2a5fe3e47bc4ca1c
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross ARCH=nds32 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   fs/block_dev.c: In function 'blkdev_fallocate':
>> fs/block_dev.c:1730:9: error: implicit declaration of function 'filemap_invalidate_lock' [-Werror=implicit-function-declaration]
    1730 |         filemap_invalidate_lock(inode->i_mapping);
         |         ^~~~~~~~~~~~~~~~~~~~~~~
>> fs/block_dev.c:1762:9: error: implicit declaration of function 'filemap_invalidate_unlock' [-Werror=implicit-function-declaration]
    1762 |         filemap_invalidate_unlock(inode->i_mapping);
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors


vim +/filemap_invalidate_lock +1730 fs/block_dev.c

  1694	
  1695	#define	BLKDEV_FALLOC_FL_SUPPORTED					\
  1696			(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
  1697			 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
  1698	
  1699	static long blkdev_fallocate(struct file *file, int mode, loff_t start,
  1700				     loff_t len)
  1701	{
  1702		struct inode *inode = bdev_file_inode(file);
  1703		struct block_device *bdev = I_BDEV(inode);
  1704		loff_t end = start + len - 1;
  1705		loff_t isize;
  1706		int error;
  1707	
  1708		/* Fail if we don't recognize the flags. */
  1709		if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
  1710			return -EOPNOTSUPP;
  1711	
  1712		/* Don't go off the end of the device. */
  1713		isize = i_size_read(bdev->bd_inode);
  1714		if (start >= isize)
  1715			return -EINVAL;
  1716		if (end >= isize) {
  1717			if (mode & FALLOC_FL_KEEP_SIZE) {
  1718				len = isize - start;
  1719				end = start + len - 1;
  1720			} else
  1721				return -EINVAL;
  1722		}
  1723	
  1724		/*
  1725		 * Don't allow IO that isn't aligned to logical block size.
  1726		 */
  1727		if ((start | len) & (bdev_logical_block_size(bdev) - 1))
  1728			return -EINVAL;
  1729	
> 1730		filemap_invalidate_lock(inode->i_mapping);
  1731	
  1732		/* Invalidate the page cache, including dirty pages. */
  1733		error = truncate_bdev_range(bdev, file->f_mode, start, end);
  1734		if (error)
  1735			goto fail;
  1736	
  1737		switch (mode) {
  1738		case FALLOC_FL_ZERO_RANGE:
  1739		case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
  1740			error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
  1741						    GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
  1742			break;
  1743		case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
  1744			error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
  1745						     GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
  1746			break;
  1747		case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
  1748			error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
  1749						     GFP_KERNEL, 0);
  1750			break;
  1751		default:
  1752			error = -EOPNOTSUPP;
  1753		}
  1754		/*
  1755		 * Invalidate the page cache again; if someone wandered in and dirtied
  1756		 * a page, we just discard it - userspace has no way of knowing whether
  1757		 * the write happened before or after discard completing...
  1758		 */
  1759		if (!error)
  1760			error = truncate_bdev_range(bdev, file->f_mode, start, end);
  1761	 fail:
> 1762		filemap_invalidate_unlock(inode->i_mapping);
  1763		return error;
  1764	}
  1765	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Jan Kara Sept. 16, 2021, 2:16 p.m. UTC | #2
On Wed 15-09-21 20:35:45, Ming Lei wrote:
> When running ->fallocate(), blkdev_fallocate() should hold
> mapping->invalidate_lock to prevent page cache from being
> accessed, otherwise stale data may be read in page cache.
> 
> Without this patch, blktests block/009 fails sometimes. With
> this patch, block/009 can pass always.
> 
> Cc: Jan Kara <jack@suse.cz>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>

Interesting. Looking at block/009 testcase I'm somewhat struggling to see
how it could trigger a situation where stale data would be seen. Do you
have some independent read accesses to the block device while the testcase
is running? That would explain it and yes, this patch should fix that.

BTW, you'd need to rebase this against current Linus' tree - Christoph has
moved this code to block/...

Also with the invalidate_lock held, there's no need for the second
truncate_bdev_range() call anymore. No pages can be created in the
discarded area while you are holding the invalidate_lock.

								Honza

> ---
>  fs/block_dev.c | 18 +++++++++++-------
>  1 file changed, 11 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 45df6cbccf12..f55e14ae89a0 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1516,7 +1516,8 @@ static const struct address_space_operations def_blk_aops = {
>  static long blkdev_fallocate(struct file *file, int mode, loff_t start,
>  			     loff_t len)
>  {
> -	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
> +	struct inode *inode = bdev_file_inode(file);
> +	struct block_device *bdev = I_BDEV(inode);
>  	loff_t end = start + len - 1;
>  	loff_t isize;
>  	int error;
> @@ -1543,10 +1544,12 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
>  	if ((start | len) & (bdev_logical_block_size(bdev) - 1))
>  		return -EINVAL;
>  
> +	filemap_invalidate_lock(inode->i_mapping);
> +
>  	/* Invalidate the page cache, including dirty pages. */
>  	error = truncate_bdev_range(bdev, file->f_mode, start, end);
>  	if (error)
> -		return error;
> +		goto fail;
>  
>  	switch (mode) {
>  	case FALLOC_FL_ZERO_RANGE:
> @@ -1563,17 +1566,18 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
>  					     GFP_KERNEL, 0);
>  		break;
>  	default:
> -		return -EOPNOTSUPP;
> +		error = -EOPNOTSUPP;
>  	}
> -	if (error)
> -		return error;
> -
>  	/*
>  	 * Invalidate the page cache again; if someone wandered in and dirtied
>  	 * a page, we just discard it - userspace has no way of knowing whether
>  	 * the write happened before or after discard completing...
>  	 */
> -	return truncate_bdev_range(bdev, file->f_mode, start, end);
> +	if (!error)
> +		error = truncate_bdev_range(bdev, file->f_mode, start, end);
> + fail:
> +	filemap_invalidate_unlock(inode->i_mapping);
> +	return error;
>  }
>  
>  const struct file_operations def_blk_fops = {
> -- 
> 2.31.1
>
Ming Lei Sept. 23, 2021, 1:51 a.m. UTC | #3
On Thu, Sep 16, 2021 at 04:16:55PM +0200, Jan Kara wrote:
> On Wed 15-09-21 20:35:45, Ming Lei wrote:
> > When running ->fallocate(), blkdev_fallocate() should hold
> > mapping->invalidate_lock to prevent page cache from being
> > accessed, otherwise stale data may be read in page cache.
> > 
> > Without this patch, blktests block/009 fails sometimes. With
> > this patch, block/009 can pass always.
> > 
> > Cc: Jan Kara <jack@suse.cz>
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> 
> Interesting. Looking at block/009 testcase I'm somewhat struggling to see
> how it could trigger a situation where stale data would be seen. Do you
> have some independent read accesses to the block device while the testcase
> is running? That would explain it and yes, this patch should fix that.

Yeah, the issue can be reproduced quite easily if independent read is
run. However, sometimes it can be triggered without this read, since
systemd-udev or reading partition table may do that too.

> 
> BTW, you'd need to rebase this against current Linus' tree - Christoph has
> moved this code to block/...

OK.

> 
> Also with the invalidate_lock held, there's no need for the second
> truncate_bdev_range() call anymore. No pages can be created in the
> discarded area while you are holding the invalidate_lock.

Good catch, will remove the 2nd truncate_bdev_range() in V2.


Thanks, 
Ming
diff mbox series

Patch

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 45df6cbccf12..f55e14ae89a0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1516,7 +1516,8 @@  static const struct address_space_operations def_blk_aops = {
 static long blkdev_fallocate(struct file *file, int mode, loff_t start,
 			     loff_t len)
 {
-	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+	struct inode *inode = bdev_file_inode(file);
+	struct block_device *bdev = I_BDEV(inode);
 	loff_t end = start + len - 1;
 	loff_t isize;
 	int error;
@@ -1543,10 +1544,12 @@  static long blkdev_fallocate(struct file *file, int mode, loff_t start,
 	if ((start | len) & (bdev_logical_block_size(bdev) - 1))
 		return -EINVAL;
 
+	filemap_invalidate_lock(inode->i_mapping);
+
 	/* Invalidate the page cache, including dirty pages. */
 	error = truncate_bdev_range(bdev, file->f_mode, start, end);
 	if (error)
-		return error;
+		goto fail;
 
 	switch (mode) {
 	case FALLOC_FL_ZERO_RANGE:
@@ -1563,17 +1566,18 @@  static long blkdev_fallocate(struct file *file, int mode, loff_t start,
 					     GFP_KERNEL, 0);
 		break;
 	default:
-		return -EOPNOTSUPP;
+		error = -EOPNOTSUPP;
 	}
-	if (error)
-		return error;
-
 	/*
 	 * Invalidate the page cache again; if someone wandered in and dirtied
 	 * a page, we just discard it - userspace has no way of knowing whether
 	 * the write happened before or after discard completing...
 	 */
-	return truncate_bdev_range(bdev, file->f_mode, start, end);
+	if (!error)
+		error = truncate_bdev_range(bdev, file->f_mode, start, end);
+ fail:
+	filemap_invalidate_unlock(inode->i_mapping);
+	return error;
 }
 
 const struct file_operations def_blk_fops = {