[1/2] mm, dax: make pmd_fault() and friends to be the same as fault()

Message ID 148123286127.108913.2695398781030517780.stgit@djiang5-desk3.ch.intel.com (mailing list archive)
State New, archived

Commit Message

Dave Jiang Dec. 8, 2016, 9:34 p.m. UTC
Instead of passing in multiple parameters in the pmd_fault() handler,
a vmf can be passed in just like a fault() handler. This will simplify
code and remove the need for the actual pmd fault handlers to allocate a
vmf. Related functions are also modified to do the same.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 drivers/dax/dax.c             |   16 +++++++---------
 fs/dax.c                      |   29 +++++++++++++----------------
 fs/ext4/file.c                |    9 ++++-----
 fs/xfs/xfs_file.c             |   10 ++++------
 include/linux/dax.h           |    7 +++----
 include/linux/mm.h            |    3 +--
 include/trace/events/fs_dax.h |   15 +++++++--------
 mm/memory.c                   |    6 ++----
 8 files changed, 41 insertions(+), 54 deletions(-)
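
For quick reference, the core interface change (taken from the include/linux/mm.h and mm/memory.c hunks below) is:

	/* before: each ->pmd_fault() implementation received the pieces separately */
	int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
						pmd_t *, unsigned int flags);
	...
	return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, vmf->flags);

	/* after: the core hands down the struct vm_fault it already has */
	int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
	...
	return vma->vm_ops->pmd_fault(vma, vmf);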

Comments

Jan Kara Dec. 13, 2016, 12:15 p.m. UTC | #1
On Thu 08-12-16 14:34:21, Dave Jiang wrote:
> Instead of passing in multiple parameters in the pmd_fault() handler,
> a vmf can be passed in just like a fault() handler. This will simplify
> code and remove the need for the actual pmd fault handlers to allocate a
> vmf. Related functions are also modified to do the same.
> 
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>

I like the idea however see below:

> @@ -1377,21 +1376,20 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>  	if (iomap.offset + iomap.length < pos + PMD_SIZE)
>  		goto unlock_entry;
>  
> -	vmf.pgoff = pgoff;
> -	vmf.flags = flags;
> -	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
> +	vmf->pgoff = pgoff;
> +	vmf->gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;

But now it's really unexpected that you change pgoff and gfp_mask because
that will propagate back to the caller and if we return VM_FAULT_FALLBACK
we may fault in wrong PTE because of this. So dax_iomap_pmd_fault() should
not modify the passed gfp_mask, just make its callers clear __GFP_FS from
it because *they* are responsible for acquiring locks / transactions that
block __GFP_FS allocations. They are also responsible for restoring
original gfp_mask once dax_iomap_pmd_fault() returns.

dax_iomap_pmd_fault() needs to modify pgoff however it must restore it to
the original value before it returns.

Otherwise the patch looks good to me.

								Honza
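
A minimal sketch of the caller-side pattern described above, using the ext4 handler from this patch as the frame. The actual respin is not part of this thread, so the exact placement of the save/clear/restore (and the old_gfp local) is illustrative only:

static int
ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int result;
	struct inode *inode = file_inode(vma->vm_file);
	struct super_block *sb = inode->i_sb;
	gfp_t old_gfp = vmf->gfp_mask;	/* remember the mask the core passed in */
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	if (write) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	/* we hold i_mmap_sem over the fault, so block __GFP_FS reclaim ourselves */
	vmf->gfp_mask &= ~__GFP_FS;
	down_read(&EXT4_I(inode)->i_mmap_sem);
	result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops);
	up_read(&EXT4_I(inode)->i_mmap_sem);
	vmf->gfp_mask = old_gfp;	/* restore before returning to the core */
	if (write)
		sb_end_pagefault(sb);

	return result;
}

dax_iomap_pmd_fault() itself would then use vmf->gfp_mask as passed in instead of overwriting it.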

Dave Jiang Dec. 13, 2016, 6:29 p.m. UTC | #2
On 12/13/2016 05:15 AM, Jan Kara wrote:
> On Thu 08-12-16 14:34:21, Dave Jiang wrote:
>> Instead of passing in multiple parameters in the pmd_fault() handler,
>> a vmf can be passed in just like a fault() handler. This will simplify
>> code and remove the need for the actual pmd fault handlers to allocate a
>> vmf. Related functions are also modified to do the same.
>>
>> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> 
> I like the idea however see below:
> 
>> @@ -1377,21 +1376,20 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>>  	if (iomap.offset + iomap.length < pos + PMD_SIZE)
>>  		goto unlock_entry;
>>  
>> -	vmf.pgoff = pgoff;
>> -	vmf.flags = flags;
>> -	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
>> +	vmf->pgoff = pgoff;
>> +	vmf->gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
> 
> But now it's really unexpected that you change pgoff and gfp_mask because
> that will propagate back to the caller and if we return VM_FAULT_FALLBACK
> we may fault in wrong PTE because of this. So dax_iomap_pmd_fault() should
> not modify the passed gfp_mask, just make its callers clear __GFP_FS from
> it because *they* are responsible for acquiring locks / transactions that
> block __GFP_FS allocations. They are also responsible for restoring
> original gfp_mask once dax_iomap_pmd_fault() returns.

Ok will fix.

> 
> dax_iomap_pmd_fault() needs to modify pgoff however it must restore it to
> the original value before it returns.

Need clarification here. Do you mean "If" dax_iomap_pmd_fault() needs to
modify.... and right now it doesn't appear to need to modify pgoff so
nothing needs to be done? Thanks.

> 
> Otherwise the patch looks good to me.
> 
> 								Honza
> 
Jan Kara Dec. 14, 2016, 9:57 a.m. UTC | #3
On Tue 13-12-16 11:29:54, Dave Jiang wrote:
> 
> 
> On 12/13/2016 05:15 AM, Jan Kara wrote:
> > On Thu 08-12-16 14:34:21, Dave Jiang wrote:
> >> Instead of passing in multiple parameters in the pmd_fault() handler,
> >> a vmf can be passed in just like a fault() handler. This will simplify
> >> code and remove the need for the actual pmd fault handlers to allocate a
> >> vmf. Related functions are also modified to do the same.
> >>
> >> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> >> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> > 
> > I like the idea however see below:
> > 
> >> @@ -1377,21 +1376,20 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
> >>  	if (iomap.offset + iomap.length < pos + PMD_SIZE)
> >>  		goto unlock_entry;
> >>  
> >> -	vmf.pgoff = pgoff;
> >> -	vmf.flags = flags;
> >> -	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
> >> +	vmf->pgoff = pgoff;
> >> +	vmf->gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
> > 
> > But now it's really unexpected that you change pgoff and gfp_mask because
> > that will propagate back to the caller and if we return VM_FAULT_FALLBACK
> > we may fault in wrong PTE because of this. So dax_iomap_pmd_fault() should
> > not modify the passed gfp_mask, just make its callers clear __GFP_FS from
> > it because *they* are responsible for acquiring locks / transactions that
> > block __GFP_FS allocations. They are also responsible for restoring
> > original gfp_mask once dax_iomap_pmd_fault() returns.
> 
> Ok will fix.
> 
> > 
> > dax_iomap_pmd_fault() needs to modify pgoff however it must restore it to
> > the original value before it returns.
> 
> Need clarification here. Do you mean "If" dax_iomap_pmd_fault() needs to
> modify.... and right now it doesn't appear to need to modify pgoff so
> nothing needs to be done? Thanks.

How come? I can see:

	pgoff = linear_page_index(vma, pmd_addr);

a few lines above - we need to modify pgoff to contain huge page aligned
file index instead of only page aligned...

								Honza
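
The two lines Honza points at are in the fs/dax.c hunk below; the index is recomputed from the PMD-rounded address (2MB-aligned with 4K pages on x86-64), not from the exact faulting address the core used when it filled in vmf->pgoff:

	unsigned long pmd_addr = vmf->address & PMD_MASK;	/* round down to the PMD boundary */
	...
	pgoff = linear_page_index(vma, pmd_addr);		/* huge-page-aligned file index */

Writing that value into vmf->pgoff and leaving it there is exactly what would leak back to the PTE fallback path, hence the requirement to restore it before returning.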
Dave Jiang Dec. 14, 2016, 4:57 p.m. UTC | #4
On 12/14/2016 02:57 AM, Jan Kara wrote:
> On Tue 13-12-16 11:29:54, Dave Jiang wrote:
>>
>>
>> On 12/13/2016 05:15 AM, Jan Kara wrote:
>>> On Thu 08-12-16 14:34:21, Dave Jiang wrote:
>>>> Instead of passing in multiple parameters in the pmd_fault() handler,
>>>> a vmf can be passed in just like a fault() handler. This will simplify
>>>> code and remove the need for the actual pmd fault handlers to allocate a
>>>> vmf. Related functions are also modified to do the same.
>>>>
>>>> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>>>> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
>>>
>>> I like the idea however see below:
>>>
>>>> @@ -1377,21 +1376,20 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
>>>>  	if (iomap.offset + iomap.length < pos + PMD_SIZE)
>>>>  		goto unlock_entry;
>>>>  
>>>> -	vmf.pgoff = pgoff;
>>>> -	vmf.flags = flags;
>>>> -	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
>>>> +	vmf->pgoff = pgoff;
>>>> +	vmf->gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
>>>
>>> But now it's really unexpected that you change pgoff and gfp_mask because
>>> that will propagate back to the caller and if we return VM_FAULT_FALLBACK
>>> we may fault in wrong PTE because of this. So dax_iomap_pmd_fault() should
>>> not modify the passed gfp_mask, just make its callers clear __GFP_FS from
>>> it because *they* are responsible for acquiring locks / transactions that
>>> block __GFP_FS allocations. They are also responsible for restoring
>>> original gfp_mask once dax_iomap_pmd_fault() returns.
>>
>> Ok will fix.
>>
>>>
>>> dax_iomap_pmd_fault() needs to modify pgoff however it must restore it to
>>> the original value before it returns.
>>
>> Need clarification here. Do you mean "If" dax_iomap_pmd_fault() needs to
>> modify.... and right now it doesn't appear to need to modify pgoff so
>> nothing needs to be done? Thanks.
> 
> How come? I can see:
> 
> 	pgoff = linear_page_index(vma, pmd_addr);
> 
> a few lines above - we need to modify pgoff to contain huge page aligned
> file index instead of only page aligned...
> 
> 								Honza
> 

Yep. My mistake. I misunderstood. Will fix.
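
So the agreed shape of the fix, roughly: dax_iomap_pmd_fault() keeps working with the huge-page-aligned index internally but puts the caller's value back before returning, while the gfp_mask adjustment moves out to the filesystem callers as sketched earlier. The respin is not in this thread; the fragment below (including the old_pgoff local) is only an illustration:

int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
		struct iomap_ops *ops)
{
	pgoff_t old_pgoff = vmf->pgoff;	/* PAGE-aligned index from the core */
	...
	/* use the huge-page-aligned index internally */
	vmf->pgoff = linear_page_index(vma, vmf->address & PMD_MASK);
	...
out:
	trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result);
	vmf->pgoff = old_pgoff;		/* undo before VM_FAULT_FALLBACK is acted on */
	return result;
}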

Patch

diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index c753a4c..947e49a 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -379,10 +379,9 @@  static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 
 static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
-		struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
-		unsigned int flags)
+		struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	unsigned long pmd_addr = addr & PMD_MASK;
+	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	struct device *dev = &dax_dev->dev;
 	struct dax_region *dax_region;
 	phys_addr_t phys;
@@ -414,23 +413,22 @@  static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 
 	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-	return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
-			flags & FAULT_FLAG_WRITE);
+	return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+			vmf->flags & FAULT_FLAG_WRITE);
 }
 
-static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, unsigned int flags)
+static int dax_dev_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int rc;
 	struct file *filp = vma->vm_file;
 	struct dax_dev *dax_dev = filp->private_data;
 
 	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
-			current->comm, (flags & FAULT_FLAG_WRITE)
+			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
 			? "write" : "read", vma->vm_start, vma->vm_end);
 
 	rcu_read_lock();
-	rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
+	rc = __dax_dev_pmd_fault(dax_dev, vma, vmf);
 	rcu_read_unlock();
 
 	return rc;
diff --git a/fs/dax.c b/fs/dax.c
index d3fe880..21ebe5b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1310,18 +1310,17 @@  static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
 	return VM_FAULT_FALLBACK;
 }
 
-int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+		struct iomap_ops *ops)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
-	unsigned long pmd_addr = address & PMD_MASK;
-	bool write = flags & FAULT_FLAG_WRITE;
+	unsigned long pmd_addr = vmf->address & PMD_MASK;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
 	struct inode *inode = mapping->host;
 	int result = VM_FAULT_FALLBACK;
 	struct iomap iomap = { 0 };
 	pgoff_t max_pgoff, pgoff;
-	struct vm_fault vmf;
 	void *entry;
 	loff_t pos;
 	int error;
@@ -1334,7 +1333,7 @@  int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pgoff = linear_page_index(vma, pmd_addr);
 	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
-	trace_dax_pmd_fault(inode, vma, address, flags, pgoff, max_pgoff, 0);
+	trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0);
 
 	/* Fall back to PTEs if we're going to COW */
 	if (write && !(vma->vm_flags & VM_SHARED))
@@ -1377,21 +1376,20 @@  int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto unlock_entry;
 
-	vmf.pgoff = pgoff;
-	vmf.flags = flags;
-	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
+	vmf->pgoff = pgoff;
+	vmf->gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
 
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
-				&iomap, pos, write, &entry);
+		result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf,
+				vmf->address, &iomap, pos, write, &entry);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
 			goto unlock_entry;
-		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
-				&entry);
+		result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address,
+				&iomap, &entry);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1417,12 +1415,11 @@  int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	}
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
-		split_huge_pmd(vma, pmd, address);
+		split_huge_pmd(vma, vmf->pmd, vmf->address);
 		count_vm_event(THP_FAULT_FALLBACK);
 	}
 out:
-	trace_dax_pmd_fault_done(inode, vma, address, flags, pgoff, max_pgoff,
-			result);
+	trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result);
 	return result;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d663d3d..10b64ba 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -275,21 +275,20 @@  static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return result;
 }
 
-static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-						pmd_t *pmd, unsigned int flags)
+static int
+ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int result;
 	struct inode *inode = file_inode(vma->vm_file);
 	struct super_block *sb = inode->i_sb;
-	bool write = flags & FAULT_FLAG_WRITE;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
 	down_read(&EXT4_I(inode)->i_mmap_sem);
-	result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
-				     &ext4_iomap_ops);
+	result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	if (write)
 		sb_end_pagefault(sb);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index d818c16..df0009f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1526,9 +1526,7 @@  xfs_filemap_fault(
 STATIC int
 xfs_filemap_pmd_fault(
 	struct vm_area_struct	*vma,
-	unsigned long		addr,
-	pmd_t			*pmd,
-	unsigned int		flags)
+	struct vm_fault *vmf)
 {
 	struct inode		*inode = file_inode(vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
@@ -1539,16 +1537,16 @@  xfs_filemap_pmd_fault(
 
 	trace_xfs_filemap_pmd_fault(ip);
 
-	if (flags & FAULT_FLAG_WRITE) {
+	if (vmf->flags & FAULT_FLAG_WRITE) {
 		sb_start_pagefault(inode->i_sb);
 		file_update_time(vma->vm_file);
 	}
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
+	ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
-	if (flags & FAULT_FLAG_WRITE)
+	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(inode->i_sb);
 
 	return ret;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 6e36b11..9761c90 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -71,16 +71,15 @@  static inline unsigned int dax_radix_order(void *entry)
 		return PMD_SHIFT - PAGE_SHIFT;
 	return 0;
 }
-int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops);
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+		struct iomap_ops *ops);
 #else
 static inline unsigned int dax_radix_order(void *entry)
 {
 	return 0;
 }
 static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
-		unsigned long address, pmd_t *pmd, unsigned int flags,
-		struct iomap_ops *ops)
+		struct vm_fault *vmf, struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 30f416a..aef645b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -347,8 +347,7 @@  struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	int (*mremap)(struct vm_area_struct * area);
 	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
-	int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
-						pmd_t *, unsigned int flags);
+	int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
 	void (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index c3b0aae..a98665b 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -8,9 +8,8 @@ 
 
 DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 	TP_PROTO(struct inode *inode, struct vm_area_struct *vma,
-		unsigned long address, unsigned int flags, pgoff_t pgoff,
-		pgoff_t max_pgoff, int result),
-	TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result),
+		struct vm_fault *vmf, pgoff_t max_pgoff, int result),
+	TP_ARGS(inode, vma, vmf, max_pgoff, result),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
 		__field(unsigned long, vm_start)
@@ -29,9 +28,9 @@  DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 		__entry->vm_start = vma->vm_start;
 		__entry->vm_end = vma->vm_end;
 		__entry->vm_flags = vma->vm_flags;
-		__entry->address = address;
-		__entry->flags = flags;
-		__entry->pgoff = pgoff;
+		__entry->address = vmf->address;
+		__entry->flags = vmf->flags;
+		__entry->pgoff = vmf->pgoff;
 		__entry->max_pgoff = max_pgoff;
 		__entry->result = result;
 	),
@@ -54,9 +53,9 @@  DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 #define DEFINE_PMD_FAULT_EVENT(name) \
 DEFINE_EVENT(dax_pmd_fault_class, name, \
 	TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \
-		unsigned long address, unsigned int flags, pgoff_t pgoff, \
+		struct vm_fault *vmf, \
 		pgoff_t max_pgoff, int result), \
-	TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result))
+	TP_ARGS(inode, vma, vmf, max_pgoff, result))
 
 DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
 DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
diff --git a/mm/memory.c b/mm/memory.c
index e37250f..8ec36cf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3447,8 +3447,7 @@  static int create_huge_pmd(struct vm_fault *vmf)
 	if (vma_is_anonymous(vma))
 		return do_huge_pmd_anonymous_page(vmf);
 	if (vma->vm_ops->pmd_fault)
-		return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
-				vmf->flags);
+		return vma->vm_ops->pmd_fault(vma, vmf);
 	return VM_FAULT_FALLBACK;
 }
 
@@ -3457,8 +3456,7 @@  static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_wp_page(vmf, orig_pmd);
 	if (vmf->vma->vm_ops->pmd_fault)
-		return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
-				vmf->pmd, vmf->flags);
+		return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf);
 
 	/* COW handled on pte level: split pmd */
 	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);