[PATCHv5,2/3] ext4: Start with shared i_rwsem in case of DIO instead of exclusive

Message ID	20191212055557.11151-3-riteshh@linux.ibm.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <SRS0=kM2h=2C=vger.kernel.org=linux-fsdevel-owner@kernel.org> Gateway: Authorized Use Only! Violators will be prosecuted for <linux-fsdevel@vger.kernel.org> from <riteshh@linux.ibm.com>; Thu, 12 Dec 2019 05:56:10 -0000 Gateway: Authorized Use Only! Violators will be prosecuted; (version=TLSv1/SSLv3 cipher=AES256-GCM-SHA384 bits=256/256) Thu, 12 Dec 2019 05:56:07 -0000 From: Ritesh Harjani <riteshh@linux.ibm.com> To: jack@suse.cz, tytso@mit.edu, linux-ext4@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org, mbobrowski@mbobrowski.org, joseph.qi@linux.alibaba.com, Ritesh Harjani <riteshh@linux.ibm.com> Subject: [PATCHv5 2/3] ext4: Start with shared i_rwsem in case of DIO instead of exclusive Date: Thu, 12 Dec 2019 11:25:56 +0530 In-Reply-To: <20191212055557.11151-1-riteshh@linux.ibm.com> References: <20191212055557.11151-1-riteshh@linux.ibm.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Message-Id: <20191212055557.11151-3-riteshh@linux.ibm.com> Sender: linux-fsdevel-owner@vger.kernel.org Precedence: bulk
Series	Fix inode_lock sequence to scale performance of DIO mixed R/W workload \| expand [PATCHv5,0/3] Fix inode_lock sequence to scale performance of DIO mixed R/W workload [PATCHv5,1/3] ext4: fix ext4_dax_read/write inode locking sequence for IOCB_NOWAIT [PATCHv5,2/3] ext4: Start with shared i_rwsem in case of DIO instead of exclusive [PATCHv5,3/3] ext4: Move to shared i_rwsem even without dioread_nolock mount opt

diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 977ac58dc718..1da49dffa3df 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -166,19 +166,25 @@ static int ext4_release_file(struct inode *inode, struct file *filp) * threads are at work on the same unwritten block, they must be synchronized * or one thread will zero the other's data, causing corruption. */ -static int -ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) +static bool +ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; - int blockmask = sb->s_blocksize - 1; - - if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize)) - return 0; + unsigned long blockmask = sb->s_blocksize - 1; if ((pos | iov_iter_alignment(from)) & blockmask) - return 1; + return true; - return 0; + return false; +} + +static bool +ext4_extending_io(struct inode *inode, loff_t offset, size_t len) +{ + if (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize) + return true; + return false; } /* Is IO overwriting allocated and initialized blocks? */ @@ -204,7 +210,8 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); } -static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +static ssize_t ext4_generic_write_checks(struct kiocb *iocb, + struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; @@ -228,11 +235,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } + return iov_iter_count(from); +} + +static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret, count; + + count = ext4_generic_write_checks(iocb, from); + if (count <= 0) + return count; + ret = file_modified(iocb->ki_filp); if (ret) return ret; - - return iov_iter_count(from); + return count; } static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, @@ -364,62 +381,139 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { .end_io = ext4_dio_write_end_io, }; +/* + * The intention here is to start with shared lock acquired then see if any + * condition requires an exclusive inode lock. If yes, then we restart the + * whole operation by releasing the shared lock and acquiring exclusive lock. + * + * - For unaligned_io we never take shared lock as it may cause data corruption + * when two unaligned IO tries to modify the same block e.g. while zeroing. + * + * - For extending writes case we don't take the shared lock, since it requires + * updating inode i_disksize and/or orphan handling with exclusive lock. + * + * - shared locking will only be true mostly with overwrites in dioread_nolock + * mode. Otherwise we will switch to exclusive i_rwsem lock. + */ +static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, + bool *ilock_shared, bool *extend) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + loff_t offset; + size_t count; + ssize_t ret; + +restart: + ret = ext4_generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + offset = iocb->ki_pos; + count = ret; + if (ext4_extending_io(inode, offset, count)) + *extend = true; + /* + * Determine whether the IO operation will overwrite allocated + * and initialized blocks. If so, check to see whether it is + * possible to take the dioread_nolock path. + * + * We need exclusive i_rwsem for changing security info + * in file_modified(). + */ + if (*ilock_shared && (!IS_NOSEC(inode) || *extend || + !ext4_should_dioread_nolock(inode) || + !ext4_overwrite_io(inode, offset, count))) { + inode_unlock_shared(inode); + *ilock_shared = false; + inode_lock(inode); + goto restart; + } + + ret = file_modified(file); + if (ret < 0) + goto out; + + return count; +out: + if (*ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); + return ret; +} + static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; - size_t count; - loff_t offset; handle_t *handle; struct inode *inode = file_inode(iocb->ki_filp); - bool extend = false, overwrite = false, unaligned_aio = false; + loff_t offset = iocb->ki_pos; + size_t count = iov_iter_count(from); + bool extend = false, unaligned_io = false; + bool ilock_shared = true; + + /* + * We initially start with shared inode lock unless it is + * unaligned IO which needs exclusive lock anyways. + */ + if (ext4_unaligned_io(inode, from, offset)) { + unaligned_io = true; + ilock_shared = false; + } + /* + * Quick check here without any i_rwsem lock to see if it is extending + * IO. A more reliable check is done in ext4_dio_write_checks() with + * proper locking in place. + */ + if (offset + count > i_size_read(inode)) + ilock_shared = false; if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; + if (ilock_shared) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + if (!inode_trylock(inode)) + return -EAGAIN; + } } else { - inode_lock(inode); + if (ilock_shared) + inode_lock_shared(inode); + else + inode_lock(inode); } + /* Fallback to buffered I/O if the inode does not support direct I/O. */ if (!ext4_dio_supported(inode)) { - inode_unlock(inode); - /* - * Fallback to buffered I/O if the inode does not support - * direct I/O. - */ + if (ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); return ext4_buffered_write_iter(iocb, from); } - ret = ext4_write_checks(iocb, from); - if (ret <= 0) { - inode_unlock(inode); + ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend); + if (ret <= 0) return ret; - } - /* - * Unaligned asynchronous direct I/O must be serialized among each - * other as the zeroing of partial blocks of two competing unaligned - * asynchronous direct I/O writes can result in data corruption. - */ offset = iocb->ki_pos; - count = iov_iter_count(from); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && - !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) { - unaligned_aio = true; - inode_dio_wait(inode); - } + count = ret; /* - * Determine whether the I/O will overwrite allocated and initialized - * blocks. If so, check to see whether it is possible to take the - * dioread_nolock path. + * Unaligned direct IO must be serialized among each other as zeroing + * of partial blocks of two competing unaligned IOs can result in data + * corruption. + * + * So we make sure we don't allow any unaligned IO in flight. + * For IOs where we need not wait (like unaligned non-AIO DIO), + * below inode_dio_wait() may anyway become a no-op, since we start + * with exclusive lock. */ - if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) && - ext4_should_dioread_nolock(inode)) { - overwrite = true; - downgrade_write(&inode->i_rwsem); - } + if (unaligned_io) + inode_dio_wait(inode); - if (offset + count > EXT4_I(inode)->i_disksize) { + if (extend) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -432,18 +526,17 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - extend = true; ext4_journal_stop(handle); } ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, - is_sync_kiocb(iocb) || unaligned_aio || extend); + is_sync_kiocb(iocb) || unaligned_io || extend); if (extend) ret = ext4_handle_inode_extension(inode, offset, ret, count); out: - if (overwrite) + if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode);

[PATCHv5,2/3] ext4: Start with shared i_rwsem in case of DIO instead of exclusive

Commit Message

Patch