diff mbox series

[1/2] mm: Protect operations adding pages to page cache with i_mapping_lock

Message ID 20210208163918.7871-2-jack@suse.cz (mailing list archive)
State New, archived
Headers show
Series fs: Hole punch vs page cache filling races | expand

Commit Message

Jan Kara Feb. 8, 2021, 4:39 p.m. UTC
Currently, serializing operations such as page fault, read, or readahead
against hole punching is rather difficult. The basic race scheme is
like:

fallocate(FALLOC_FL_PUNCH_HOLE)			read / fault / ..
  truncate_inode_pages_range()
						  <create pages in page
						   cache here>
  <update fs block mapping and free blocks>

Now the problem is in this way read / page fault / readahead can
instantiate pages in page cache with potentially stale data (if blocks
get quickly reused). Avoiding this race is not simple - page locks do
not work because we want to make sure there are *no* pages in given
range. inode->i_rwsem does not work because page fault happens under
mmap_sem which ranks below inode->i_rwsem. Also using it for reads makes
the performance for mixed read-write workloads suffer.

So create a new rw_semaphore in the inode - i_mapping_sem - that
protects adding of pages to page cache for page faults / reads /
readahead.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/inode.c         |  3 +++
 include/linux/fs.h |  2 ++
 mm/filemap.c       | 45 +++++++++++++++++++++++++++++++++++++++++++--
 mm/readahead.c     |  2 ++
 4 files changed, 50 insertions(+), 2 deletions(-)

Comments

Dave Chinner Feb. 9, 2021, 1:12 a.m. UTC | #1
On Mon, Feb 08, 2021 at 05:39:17PM +0100, Jan Kara wrote:
> Currently, serializing operations such as page fault, read, or readahead
> against hole punching is rather difficult. The basic race scheme is
> like:
> 
> fallocate(FALLOC_FL_PUNCH_HOLE)			read / fault / ..
>   truncate_inode_pages_range()
> 						  <create pages in page
> 						   cache here>
>   <update fs block mapping and free blocks>
> 
> Now the problem is in this way read / page fault / readahead can
> instantiate pages in page cache with potentially stale data (if blocks
> get quickly reused). Avoiding this race is not simple - page locks do
> not work because we want to make sure there are *no* pages in given
> range. inode->i_rwsem does not work because page fault happens under
> mmap_sem which ranks below inode->i_rwsem. Also using it for reads makes
> the performance for mixed read-write workloads suffer.
> 
> So create a new rw_semaphore in the inode - i_mapping_sem - that
> protects adding of pages to page cache for page faults / reads /
> readahead.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/inode.c         |  3 +++
>  include/linux/fs.h |  2 ++
>  mm/filemap.c       | 45 +++++++++++++++++++++++++++++++++++++++++++--
>  mm/readahead.c     |  2 ++
>  4 files changed, 50 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index 6442d97d9a4a..8df49d98e1cd 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -174,6 +174,9 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
>  
>  	init_rwsem(&inode->i_rwsem);
>  	lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);
> +	init_rwsem(&inode->i_mapping_sem);
> +	lockdep_set_class(&inode->i_mapping_sem,
> +			  &sb->s_type->i_mapping_sem_key);
>  
>  	atomic_set(&inode->i_dio_count, 0);
>  
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index b20ddd8a6e62..248609bc61a2 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -658,6 +658,7 @@ struct inode {
>  	/* Misc */
>  	unsigned long		i_state;
>  	struct rw_semaphore	i_rwsem;
> +	struct rw_semaphore	i_mapping_sem;
>  
>  	unsigned long		dirtied_when;	/* jiffies of first dirtying */
>  	unsigned long		dirtied_time_when;
> @@ -2249,6 +2250,7 @@ struct file_system_type {
>  
>  	struct lock_class_key i_lock_key;
>  	struct lock_class_key i_mutex_key;
> +	struct lock_class_key i_mapping_sem_key;
>  	struct lock_class_key i_mutex_dir_key;
>  };
>  
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 16a3bf693d4a..02f778ff02e0 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2257,16 +2257,28 @@ static int filemap_update_page(struct kiocb *iocb,
>  {
>  	int error;
>  
> +	if (iocb->ki_flags & IOCB_NOWAIT) {
> +		if (!down_read_trylock(&mapping->host->i_mapping_sem))
> +			return -EAGAIN;
> +	} else {
> +		down_read(&mapping->host->i_mapping_sem);
> +	}
> +
>  	if (!trylock_page(page)) {
> -		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
> +		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) {
> +			up_read(&mapping->host->i_mapping_sem);
>  			return -EAGAIN;
> +		}
>  		if (!(iocb->ki_flags & IOCB_WAITQ)) {
> +			up_read(&mapping->host->i_mapping_sem);
>  			put_and_wait_on_page_locked(page, TASK_KILLABLE);
>  			return AOP_TRUNCATED_PAGE;
>  		}
>  		error = __lock_page_async(page, iocb->ki_waitq);
> -		if (error)
> +		if (error) {
> +			up_read(&mapping->host->i_mapping_sem);
>  			return error;
> +		}
>  	}

What tree is this against? I don't see filemap_update_page() in a
5.11-rc7 tree...

Cheers,

Dave.
Matthew Wilcox Feb. 9, 2021, 1:29 a.m. UTC | #2
On Tue, Feb 09, 2021 at 12:12:58PM +1100, Dave Chinner wrote:
> On Mon, Feb 08, 2021 at 05:39:17PM +0100, Jan Kara wrote:
> > +++ b/mm/filemap.c
> > @@ -2257,16 +2257,28 @@ static int filemap_update_page(struct kiocb *iocb,
> 
> What tree is this against? I don't see filemap_update_page() in a
> 5.11-rc7 tree...

It's in mmotm or -next.
diff mbox series

Patch

diff --git a/fs/inode.c b/fs/inode.c
index 6442d97d9a4a..8df49d98e1cd 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -174,6 +174,9 @@  int inode_init_always(struct super_block *sb, struct inode *inode)
 
 	init_rwsem(&inode->i_rwsem);
 	lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);
+	init_rwsem(&inode->i_mapping_sem);
+	lockdep_set_class(&inode->i_mapping_sem,
+			  &sb->s_type->i_mapping_sem_key);
 
 	atomic_set(&inode->i_dio_count, 0);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b20ddd8a6e62..248609bc61a2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -658,6 +658,7 @@  struct inode {
 	/* Misc */
 	unsigned long		i_state;
 	struct rw_semaphore	i_rwsem;
+	struct rw_semaphore	i_mapping_sem;
 
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 	unsigned long		dirtied_time_when;
@@ -2249,6 +2250,7 @@  struct file_system_type {
 
 	struct lock_class_key i_lock_key;
 	struct lock_class_key i_mutex_key;
+	struct lock_class_key i_mapping_sem_key;
 	struct lock_class_key i_mutex_dir_key;
 };
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 16a3bf693d4a..02f778ff02e0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2257,16 +2257,28 @@  static int filemap_update_page(struct kiocb *iocb,
 {
 	int error;
 
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!down_read_trylock(&mapping->host->i_mapping_sem))
+			return -EAGAIN;
+	} else {
+		down_read(&mapping->host->i_mapping_sem);
+	}
+
 	if (!trylock_page(page)) {
-		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
+		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) {
+			up_read(&mapping->host->i_mapping_sem);
 			return -EAGAIN;
+		}
 		if (!(iocb->ki_flags & IOCB_WAITQ)) {
+			up_read(&mapping->host->i_mapping_sem);
 			put_and_wait_on_page_locked(page, TASK_KILLABLE);
 			return AOP_TRUNCATED_PAGE;
 		}
 		error = __lock_page_async(page, iocb->ki_waitq);
-		if (error)
+		if (error) {
+			up_read(&mapping->host->i_mapping_sem);
 			return error;
+		}
 	}
 
 	if (!page->mapping)
@@ -2283,12 +2295,14 @@  static int filemap_update_page(struct kiocb *iocb,
 	error = filemap_read_page(iocb->ki_filp, mapping, page);
 	if (error == AOP_TRUNCATED_PAGE)
 		put_page(page);
+	up_read(&mapping->host->i_mapping_sem);
 	return error;
 truncated:
 	error = AOP_TRUNCATED_PAGE;
 	put_page(page);
 unlock:
 	unlock_page(page);
+	up_read(&mapping->host->i_mapping_sem);
 	return error;
 }
 
@@ -2303,6 +2317,19 @@  static int filemap_create_page(struct file *file,
 	if (!page)
 		return -ENOMEM;
 
+	/*
+	 * Protect against truncate / hole punch. Grabbing i_mapping_sem here
+	 * assures we cannot instantiate and bring uptodate new pagecache pages
+	 * after evicting page cache during truncate and before actually
+	 * freeing blocks.  Note that we could release i_mapping_sem after
+	 * inserting the page into page cache as the locked page would then be
+	 * enough to synchronize with hole punching. But there are code paths
+	 * such as filemap_update_page() filling in partially uptodate pages or
+	 * ->readpages() that need to hold i_mapping_sem while mapping blocks
+	 * for IO so let's hold the lock here as well to keep locking rules
+	 * simple.
+	 */
+	down_read(&mapping->host->i_mapping_sem);
 	error = add_to_page_cache_lru(page, mapping, index,
 			mapping_gfp_constraint(mapping, GFP_KERNEL));
 	if (error == -EEXIST)
@@ -2314,9 +2341,11 @@  static int filemap_create_page(struct file *file,
 	if (error)
 		goto error;
 
+	up_read(&mapping->host->i_mapping_sem);
 	pagevec_add(pvec, page);
 	return 0;
 error:
+	up_read(&mapping->host->i_mapping_sem);
 	put_page(page);
 	return error;
 }
@@ -2772,6 +2801,13 @@  vm_fault_t filemap_fault(struct vm_fault *vmf)
 		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;
 		fpin = do_sync_mmap_readahead(vmf);
+	}
+
+	/*
+	 * See comment in filemap_create_page() why we need i_mapping_sem
+	 */
+	down_read(&inode->i_mapping_sem);
+	if (!page) {
 retry_find:
 		page = pagecache_get_page(mapping, offset,
 					  FGP_CREAT|FGP_FOR_MMAP,
@@ -2779,6 +2815,7 @@  vm_fault_t filemap_fault(struct vm_fault *vmf)
 		if (!page) {
 			if (fpin)
 				goto out_retry;
+			up_read(&inode->i_mapping_sem);
 			return VM_FAULT_OOM;
 		}
 	}
@@ -2819,9 +2856,11 @@  vm_fault_t filemap_fault(struct vm_fault *vmf)
 	if (unlikely(offset >= max_off)) {
 		unlock_page(page);
 		put_page(page);
+		up_read(&inode->i_mapping_sem);
 		return VM_FAULT_SIGBUS;
 	}
 
+	up_read(&inode->i_mapping_sem);
 	vmf->page = page;
 	return ret | VM_FAULT_LOCKED;
 
@@ -2847,6 +2886,7 @@  vm_fault_t filemap_fault(struct vm_fault *vmf)
 	if (!error || error == AOP_TRUNCATED_PAGE)
 		goto retry_find;
 
+	up_read(&inode->i_mapping_sem);
 	shrink_readahead_size_eio(ra);
 	return VM_FAULT_SIGBUS;
 
@@ -2858,6 +2898,7 @@  vm_fault_t filemap_fault(struct vm_fault *vmf)
 	 */
 	if (page)
 		put_page(page);
+	up_read(&inode->i_mapping_sem);
 	if (fpin)
 		fput(fpin);
 	return ret | VM_FAULT_RETRY;
diff --git a/mm/readahead.c b/mm/readahead.c
index c5b0457415be..ac5bb50b3a4c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -192,6 +192,7 @@  void page_cache_ra_unbounded(struct readahead_control *ractl,
 	 */
 	unsigned int nofs = memalloc_nofs_save();
 
+	down_read(&mapping->host->i_mapping_sem);
 	/*
 	 * Preallocate as many pages as we will need.
 	 */
@@ -236,6 +237,7 @@  void page_cache_ra_unbounded(struct readahead_control *ractl,
 	 * will then handle the error.
 	 */
 	read_pages(ractl, &page_pool, false);
+	up_read(&mapping->host->i_mapping_sem);
 	memalloc_nofs_restore(nofs);
 }
 EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);