diff mbox

[1/2] Btrfs: serialize unlocked dio reads with truncate

Message ID 510A3807.1040306@cn.fujitsu.com (mailing list archive)
State New, archived
Headers show

Commit Message

Miao Xie Jan. 31, 2013, 9:23 a.m. UTC
Currently, we can do unlocked dio reads, but the following race
is possible:

dio_read_task			truncate_task
				->btrfs_setattr()
->btrfs_direct_IO
    ->__blockdev_direct_IO
      ->btrfs_get_block
				  ->btrfs_truncate()
				 #alloc truncated blocks
				 #to other inode
      ->submit_io()
     #INFORMATION LEAK

In order to avoid this problem, we must serialize unlocked dio reads with
truncate by using inode_dio_wait().

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/btrfs_inode.h | 19 +++++++++++++++++++
 fs/btrfs/inode.c       | 31 +++++++++++++++++++++++++++----
 2 files changed, 46 insertions(+), 4 deletions(-)

Comments

Josef Bacik Jan. 31, 2013, 4:40 p.m. UTC | #1
On Thu, Jan 31, 2013 at 02:23:19AM -0700, Miao Xie wrote:
> Currently, we can do unlocked dio reads, but the following race
> is possible:
> 
> dio_read_task			truncate_task
> 				->btrfs_setattr()
> ->btrfs_direct_IO
>     ->__blockdev_direct_IO
>       ->btrfs_get_block
> 				  ->btrfs_truncate()
> 				 #alloc truncated blocks
> 				 #to other inode
>       ->submit_io()
>      #INFORMATION LEAK
> 
> In order to avoid this problem, we must serialize unlocked dio reads with
> truncate by inode_dio_wait().
> 

So I had been thinking about this: are we sure we don't want to just lock the extent
range when we truncate?  I'm good with this, but it seems like we might as well
be consistent and use the extent locks.  What do you think?  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Miao Xie Feb. 1, 2013, 5:56 a.m. UTC | #2
On Thu, 31 Jan 2013 11:40:41 -0500, Josef Bacik wrote:
> On Thu, Jan 31, 2013 at 02:23:19AM -0700, Miao Xie wrote:
>> Currently, we can do unlocked dio reads, but the following race
>> is possible:
>>
>> dio_read_task			truncate_task
>> 				->btrfs_setattr()
>> ->btrfs_direct_IO
>>     ->__blockdev_direct_IO
>>       ->btrfs_get_block
>> 				  ->btrfs_truncate()
>> 				 #alloc truncated blocks
>> 				 #to other inode
>>       ->submit_io()
>>      #INFORMATION LEAK
>>
>> In order to avoid this problem, we must serialize unlocked dio reads with
>> truncate by inode_dio_wait().
>>
> 
> So I had thinking about this, are we sure we don't want to just lock the extent
> range when we truncate?  I'm good with this, but it seems like we might as well
> and be consistent and use the extent locks.  What do you think?  Thanks,

But comparing with the current approach, the extent lock has the following problem:
	Dio_Read_Task			Truncate_task
					truncate file
					  set isize to 4096
					  drop pages
	lock extent[4096, 8191]
	read extent[4096, 8191]
	unlock extent[4096, 8191]
					  lock extent[4096, -1ULL]
					  truncate item
					  unlock extent[4096, -1ULL]
	lock extent[8192, ...]
	read extent[8192, ...]
	  no extent item
	  zero the buffer
	unlock extent[8192, ...]

we get data that is mixed with new data. (Punch hole also has this problem; we need
to fix it.)

Thanks
Miao
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik Feb. 1, 2013, 2:40 p.m. UTC | #3
On Thu, Jan 31, 2013 at 10:56:34PM -0700, Miao Xie wrote:
> On Thu, 31 Jan 2013 11:40:41 -0500, Josef Bacik wrote:
> > On Thu, Jan 31, 2013 at 02:23:19AM -0700, Miao Xie wrote:
> >> Currently, we can do unlocked dio reads, but the following race
> >> is possible:
> >>
> >> dio_read_task			truncate_task
> >> 				->btrfs_setattr()
> >> ->btrfs_direct_IO
> >>     ->__blockdev_direct_IO
> >>       ->btrfs_get_block
> >> 				  ->btrfs_truncate()
> >> 				 #alloc truncated blocks
> >> 				 #to other inode
> >>       ->submit_io()
> >>      #INFORMATION LEAK
> >>
> >> In order to avoid this problem, we must serialize unlocked dio reads with
> >> truncate by inode_dio_wait().
> >>
> > 
> > So I had thinking about this, are we sure we don't want to just lock the extent
> > range when we truncate?  I'm good with this, but it seems like we might as well
> > and be consistent and use the extent locks.  What do you think?  Thanks,
> 
> But comparing with the current approach, the extent lock has the following problem:
> 	Dio_Read_Task			Truncate_task
> 					truncate file
> 					  set isize to 4096
> 					  drop pages
> 	lock extent[4096, 8191]
> 	read extent[4096, 8191]
> 	unlock extent[4096, 8191]
> 					  lock extent[4096, -1ULL]
> 					  truncate item
> 					  unlock extent[4096, -1ULL]
> 	lock extent[8192, ...]
> 	read extent[8192, ...]
> 	  no extent item
> 	  zero the buffer
> 	unlock extent[8192, ...]
> 
> we get the data that is mixed with new data.(Punch hole also has this problem, we need
> fix)

So this case is fine, since we'll still get valid data, the extents would still
be there.  If you are mixing dio reads with simultaneous truncate/hole punching
you deserve to get your ass bitten :).  The other option would be to lock before
we set the isize, or check the isize in get_extents.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2a8c242..00e2601 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -40,6 +40,7 @@ 
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
 #define BTRFS_INODE_NEEDS_FULL_SYNC		7
 #define BTRFS_INODE_COPY_EVERYTHING		8
+#define BTRFS_INODE_READDIO_NEED_LOCK		9
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -216,4 +217,22 @@  static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	return 0;
 }
 
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+	set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
+	smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+	smp_mb__before_clear_bit();
+	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+		  &BTRFS_I(inode)->runtime_flags);
+}
+
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 97f4c30..d17a04b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3785,6 +3785,11 @@  static int btrfs_setsize(struct inode *inode, loff_t newsize)
 
 		/* we don't support swapfiles, so vmtruncate shouldn't fail */
 		truncate_setsize(inode, newsize);
+
+		btrfs_inode_block_unlocked_dio(inode);
+		inode_dio_wait(inode);
+		btrfs_inode_resume_unlocked_dio(inode);
+
 		ret = btrfs_truncate(inode);
 	}
 
@@ -6583,15 +6588,33 @@  static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	int flags = 0;
+	bool wakeup = false;
+	int ret;
 
 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
 			    offset, nr_segs))
 		return 0;
 
-	return __blockdev_direct_IO(rw, iocb, inode,
-		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
-		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
-		   btrfs_submit_direct, 0);
+	if (rw == READ) {
+		atomic_inc(&inode->i_dio_count);
+		smp_mb__after_atomic_inc();
+		if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+				      &BTRFS_I(inode)->runtime_flags))) {
+			inode_dio_done(inode);
+			flags = DIO_LOCKING | DIO_SKIP_HOLES;
+		} else {
+			wakeup = true;
+		}
+	}
+
+	ret = __blockdev_direct_IO(rw, iocb, inode,
+			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+			iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+			btrfs_submit_direct, flags);
+	if (wakeup)
+		inode_dio_done(inode);
+	return ret;
 }
 
 #define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)