[1/2] mm: Add readahead file operation
diff mbox series

Message ID 20190603132155.20600-2-jack@suse.cz
State New
Headers show
Series
  • fs: Hole punch vs page cache filling races
Related show

Commit Message

Jan Kara June 3, 2019, 1:21 p.m. UTC
Some filesystems need to acquire locks before pages are read into page
cache to protect from races with hole punching. The lock generally
cannot be acquired within readpage as it ranks above page lock so we are
left with acquiring the lock within filesystem's ->read_iter
implementation for normal reads and ->fault implementation during page
faults. That however does not cover all paths how pages can be
instantiated within page cache - namely explicitely requested readahead.
Add new ->readahead file operation which filesystem can use for this.

CC: stable@vger.kernel.org # Needed by following ext4 fix
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/fs.h |  5 +++++
 include/linux/mm.h |  3 ---
 mm/fadvise.c       | 12 +-----------
 mm/madvise.c       |  3 ++-
 mm/readahead.c     | 26 ++++++++++++++++++++++++--
 5 files changed, 32 insertions(+), 17 deletions(-)

Comments

Amir Goldstein June 3, 2019, 4:16 p.m. UTC | #1
On Mon, Jun 3, 2019 at 4:22 PM Jan Kara <jack@suse.cz> wrote:
>
> Some filesystems need to acquire locks before pages are read into page
> cache to protect from races with hole punching. The lock generally
> cannot be acquired within readpage as it ranks above page lock so we are
> left with acquiring the lock within filesystem's ->read_iter
> implementation for normal reads and ->fault implementation during page
> faults. That however does not cover all paths how pages can be
> instantiated within page cache - namely explicitely requested readahead.
> Add new ->readahead file operation which filesystem can use for this.
>
> CC: stable@vger.kernel.org # Needed by following ext4 fix
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  include/linux/fs.h |  5 +++++
>  include/linux/mm.h |  3 ---
>  mm/fadvise.c       | 12 +-----------
>  mm/madvise.c       |  3 ++-
>  mm/readahead.c     | 26 ++++++++++++++++++++++++--
>  5 files changed, 32 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index f7fdfe93e25d..9968abcd06ea 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1828,6 +1828,7 @@ struct file_operations {
>                                    struct file *file_out, loff_t pos_out,
>                                    loff_t len, unsigned int remap_flags);
>         int (*fadvise)(struct file *, loff_t, loff_t, int);
> +       int (*readahead)(struct file *, loff_t, loff_t);

The new method is redundant, because it is a subset of fadvise.
When overlayfs needed to implement both methods, Miklos
suggested that we unite them into one, hence:
3d8f7615319b vfs: implement readahead(2) using POSIX_FADV_WILLNEED

So you can accomplish the ext4 fix without the new method.
All you need extra is implementing madvise_willneed() with vfs_fadvise().

Thanks,
Amir.
Jan Kara June 4, 2019, 8 a.m. UTC | #2
On Mon 03-06-19 19:16:59, Amir Goldstein wrote:
> On Mon, Jun 3, 2019 at 4:22 PM Jan Kara <jack@suse.cz> wrote:
> >
> > Some filesystems need to acquire locks before pages are read into page
> > cache to protect from races with hole punching. The lock generally
> > cannot be acquired within readpage as it ranks above page lock so we are
> > left with acquiring the lock within filesystem's ->read_iter
> > implementation for normal reads and ->fault implementation during page
> > faults. That however does not cover all paths how pages can be
> > instantiated within page cache - namely explicitely requested readahead.
> > Add new ->readahead file operation which filesystem can use for this.
> >
> > CC: stable@vger.kernel.org # Needed by following ext4 fix
> > Signed-off-by: Jan Kara <jack@suse.cz>
> > ---
> >  include/linux/fs.h |  5 +++++
> >  include/linux/mm.h |  3 ---
> >  mm/fadvise.c       | 12 +-----------
> >  mm/madvise.c       |  3 ++-
> >  mm/readahead.c     | 26 ++++++++++++++++++++++++--
> >  5 files changed, 32 insertions(+), 17 deletions(-)
> >
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index f7fdfe93e25d..9968abcd06ea 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -1828,6 +1828,7 @@ struct file_operations {
> >                                    struct file *file_out, loff_t pos_out,
> >                                    loff_t len, unsigned int remap_flags);
> >         int (*fadvise)(struct file *, loff_t, loff_t, int);
> > +       int (*readahead)(struct file *, loff_t, loff_t);
> 
> The new method is redundant, because it is a subset of fadvise.
> When overlayfs needed to implement both methods, Miklos
> suggested that we unite them into one, hence:
> 3d8f7615319b vfs: implement readahead(2) using POSIX_FADV_WILLNEED

Yes, I've noticed this.

> So you can accomplish the ext4 fix without the new method.
> All you need extra is implementing madvise_willneed() with vfs_fadvise().

Ah, that's an interesting idea. I'll try that out. It will require some
dance in madvise() to drop mmap_sem but we already do that for
madvise_free() so I can just duplicate that.

								Honza

Patch
diff mbox series

diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7fdfe93e25d..9968abcd06ea 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1828,6 +1828,7 @@  struct file_operations {
 				   struct file *file_out, loff_t pos_out,
 				   loff_t len, unsigned int remap_flags);
 	int (*fadvise)(struct file *, loff_t, loff_t, int);
+	int (*readahead)(struct file *, loff_t, loff_t);
 } __randomize_layout;
 
 struct inode_operations {
@@ -3537,6 +3538,10 @@  extern void inode_nohighmem(struct inode *inode);
 extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
 		       int advice);
 
+/* mm/readahead.c */
+extern int generic_readahead(struct file *filp, loff_t start, loff_t end);
+extern int vfs_readahead(struct file *filp, loff_t start, loff_t end);
+
 #if defined(CONFIG_IO_URING)
 extern struct sock *io_uring_get_socket(struct file *file);
 #else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0e8834ac32b7..8f6597295920 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2461,9 +2461,6 @@  void task_dirty_inc(struct task_struct *tsk);
 /* readahead.c */
 #define VM_READAHEAD_PAGES	(SZ_128K / PAGE_SIZE)
 
-int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
-			pgoff_t offset, unsigned long nr_to_read);
-
 void page_cache_sync_readahead(struct address_space *mapping,
 			       struct file_ra_state *ra,
 			       struct file *filp,
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 467bcd032037..e5aab207550e 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -36,7 +36,6 @@  static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
 	loff_t endbyte;			/* inclusive */
 	pgoff_t start_index;
 	pgoff_t end_index;
-	unsigned long nrpages;
 
 	inode = file_inode(file);
 	if (S_ISFIFO(inode->i_mode))
@@ -94,20 +93,11 @@  static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
 		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_WILLNEED:
-		/* First and last PARTIAL page! */
-		start_index = offset >> PAGE_SHIFT;
-		end_index = endbyte >> PAGE_SHIFT;
-
-		/* Careful about overflow on the "+1" */
-		nrpages = end_index - start_index + 1;
-		if (!nrpages)
-			nrpages = ~0UL;
-
 		/*
 		 * Ignore return value because fadvise() shall return
 		 * success even if filesystem can't retrieve a hint,
 		 */
-		force_page_cache_readahead(mapping, file, start_index, nrpages);
+		vfs_readahead(file, offset, endbyte);
 		break;
 	case POSIX_FADV_NOREUSE:
 		break;
diff --git a/mm/madvise.c b/mm/madvise.c
index 628022e674a7..9111b75e88cf 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -303,7 +303,8 @@  static long madvise_willneed(struct vm_area_struct *vma,
 		end = vma->vm_end;
 	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
-	force_page_cache_readahead(file->f_mapping, file, start, end - start);
+	vfs_readahead(file, (loff_t)start << PAGE_SHIFT,
+		      (loff_t)end << PAGE_SHIFT);
 	return 0;
 }
 
diff --git a/mm/readahead.c b/mm/readahead.c
index 2fe72cd29b47..e66ae8c764ad 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -219,8 +219,9 @@  unsigned int __do_page_cache_readahead(struct address_space *mapping,
  * Chunk the readahead into 2 megabyte units, so that we don't pin too much
  * memory at once.
  */
-int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
-			       pgoff_t offset, unsigned long nr_to_read)
+static int force_page_cache_readahead(struct address_space *mapping,
+				      struct file *filp, pgoff_t offset,
+				      unsigned long nr_to_read)
 {
 	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
 	struct file_ra_state *ra = &filp->f_ra;
@@ -248,6 +249,20 @@  int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	return 0;
 }
 
+int generic_readahead(struct file *filp, loff_t start, loff_t end)
+{
+	pgoff_t first, last;
+	unsigned long count;
+
+	first = start >> PAGE_SHIFT;
+	last = end >> PAGE_SHIFT;
+	count = last - first + 1;
+	if (!count)
+		count = ~0UL;
+	return force_page_cache_readahead(filp->f_mapping, filp, first, count);
+}
+EXPORT_SYMBOL_GPL(generic_readahead);
+
 /*
  * Set the initial window size, round to next power of 2 and square
  * for small size, x 4 for medium, and x 2 for large
@@ -575,6 +590,13 @@  page_cache_async_readahead(struct address_space *mapping,
 }
 EXPORT_SYMBOL_GPL(page_cache_async_readahead);
 
+int vfs_readahead(struct file *filp, loff_t start, loff_t end)
+{
+	if (filp->f_op->readahead)
+		return filp->f_op->readahead(filp, start, end);
+	return generic_readahead(filp, start, end);
+}
+
 ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
 {
 	ssize_t ret;