Message ID | 148123286127.108913.2695398781030517780.stgit@djiang5-desk3.ch.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu 08-12-16 14:34:21, Dave Jiang wrote: > Instead of passing in multiple parameters in the pmd_fault() handler, > a vmf can be passed in just like a fault() handler. This will simplify > code and remove the need for the actual pmd fault handlers to allocate a > vmf. Related functions are also modified to do the same. > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> > Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com> I like the idea however see below: > @@ -1377,21 +1376,20 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, > if (iomap.offset + iomap.length < pos + PMD_SIZE) > goto unlock_entry; > > - vmf.pgoff = pgoff; > - vmf.flags = flags; > - vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; > + vmf->pgoff = pgoff; > + vmf->gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; But now it's really unexpected that you change pgoff and gfp_mask because that will propagate back to the caller and if we return VM_FAULT_FALLBACK we may fault in wrong PTE because of this. So dax_iomap_pmd_fault() should not modify the passed gfp_mask, just make its callers clear __GFP_FS from it because *they* are responsible for acquiring locks / transactions that block __GFP_FS allocations. They are also responsible for restoring original gfp_mask once dax_iomap_pmd_fault() returns. dax_iomap_pmd_fault() needs to modify pgoff however it must restore it to the original value before it returns. Otherwise the patch looks good to me. Honza > > switch (iomap.type) { > case IOMAP_MAPPED: > - result = dax_pmd_insert_mapping(vma, pmd, &vmf, address, > - &iomap, pos, write, &entry); > + result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf, > + vmf->address, &iomap, pos, write, &entry); > break; > case IOMAP_UNWRITTEN: > case IOMAP_HOLE: > if (WARN_ON_ONCE(write)) > goto unlock_entry; > - result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, > - &entry); > + result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address, > + &iomap, &entry); > break; > default: > WARN_ON_ONCE(1); > @@ -1417,12 +1415,11 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, > } > fallback: > if (result == VM_FAULT_FALLBACK) { > - split_huge_pmd(vma, pmd, address); > + split_huge_pmd(vma, vmf->pmd, vmf->address); > count_vm_event(THP_FAULT_FALLBACK); > } > out: > - trace_dax_pmd_fault_done(inode, vma, address, flags, pgoff, max_pgoff, > - result); > + trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result); > return result; > } > EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); > diff --git a/fs/ext4/file.c b/fs/ext4/file.c > index d663d3d..10b64ba 100644 > --- a/fs/ext4/file.c > +++ b/fs/ext4/file.c > @@ -275,21 +275,20 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) > return result; > } > > -static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, > - pmd_t *pmd, unsigned int flags) > +static int > +ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) > { > int result; > struct inode *inode = file_inode(vma->vm_file); > struct super_block *sb = inode->i_sb; > - bool write = flags & FAULT_FLAG_WRITE; > + bool write = vmf->flags & FAULT_FLAG_WRITE; > > if (write) { > sb_start_pagefault(sb); > file_update_time(vma->vm_file); > } > down_read(&EXT4_I(inode)->i_mmap_sem); > - result = dax_iomap_pmd_fault(vma, addr, pmd, flags, > - &ext4_iomap_ops); > + result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops); > up_read(&EXT4_I(inode)->i_mmap_sem); > if (write) > sb_end_pagefault(sb); > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index d818c16..df0009f 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -1526,9 +1526,7 @@ xfs_filemap_fault( > STATIC int > xfs_filemap_pmd_fault( > struct vm_area_struct *vma, > - unsigned long addr, > - pmd_t *pmd, > - unsigned int flags) > + struct vm_fault *vmf) > { > struct inode *inode = file_inode(vma->vm_file); > struct xfs_inode *ip = XFS_I(inode); > @@ -1539,16 +1537,16 @@ xfs_filemap_pmd_fault( > > trace_xfs_filemap_pmd_fault(ip); > > - if (flags & FAULT_FLAG_WRITE) { > + if (vmf->flags & FAULT_FLAG_WRITE) { > sb_start_pagefault(inode->i_sb); > file_update_time(vma->vm_file); > } > > xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > - ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops); > + ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops); > xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); > > - if (flags & FAULT_FLAG_WRITE) > + if (vmf->flags & FAULT_FLAG_WRITE) > sb_end_pagefault(inode->i_sb); > > return ret; > diff --git a/include/linux/dax.h b/include/linux/dax.h > index 6e36b11..9761c90 100644 > --- a/include/linux/dax.h > +++ b/include/linux/dax.h > @@ -71,16 +71,15 @@ static inline unsigned int dax_radix_order(void *entry) > return PMD_SHIFT - PAGE_SHIFT; > return 0; > } > -int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, > - pmd_t *pmd, unsigned int flags, struct iomap_ops *ops); > +int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, > + struct iomap_ops *ops); > #else > static inline unsigned int dax_radix_order(void *entry) > { > return 0; > } > static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma, > - unsigned long address, pmd_t *pmd, unsigned int flags, > - struct iomap_ops *ops) > + struct vm_fault *vmf, struct iomap_ops *ops) > { > return VM_FAULT_FALLBACK; > } > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 30f416a..aef645b 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -347,8 +347,7 @@ struct vm_operations_struct { > void (*close)(struct vm_area_struct * area); > int (*mremap)(struct vm_area_struct * area); > int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); > - int (*pmd_fault)(struct vm_area_struct *, unsigned long address, > - pmd_t *, unsigned int flags); > + int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf); > void (*map_pages)(struct vm_fault *vmf, > pgoff_t start_pgoff, pgoff_t end_pgoff); > > diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h > index c3b0aae..a98665b 100644 > --- a/include/trace/events/fs_dax.h > +++ b/include/trace/events/fs_dax.h > @@ -8,9 +8,8 @@ > > DECLARE_EVENT_CLASS(dax_pmd_fault_class, > TP_PROTO(struct inode *inode, struct vm_area_struct *vma, > - unsigned long address, unsigned int flags, pgoff_t pgoff, > - pgoff_t max_pgoff, int result), > - TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result), > + struct vm_fault *vmf, pgoff_t max_pgoff, int result), > + TP_ARGS(inode, vma, vmf, max_pgoff, result), > TP_STRUCT__entry( > __field(unsigned long, ino) > __field(unsigned long, vm_start) > @@ -29,9 +28,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, > __entry->vm_start = vma->vm_start; > __entry->vm_end = vma->vm_end; > __entry->vm_flags = vma->vm_flags; > - __entry->address = address; > - __entry->flags = flags; > - __entry->pgoff = pgoff; > + __entry->address = vmf->address; > + __entry->flags = vmf->flags; > + __entry->pgoff = vmf->pgoff; > __entry->max_pgoff = max_pgoff; > __entry->result = result; > ), > @@ -54,9 +53,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, > #define DEFINE_PMD_FAULT_EVENT(name) \ > DEFINE_EVENT(dax_pmd_fault_class, name, \ > TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ > - unsigned long address, unsigned int flags, pgoff_t pgoff, \ > + struct vm_fault *vmf, \ > pgoff_t max_pgoff, int result), \ > - TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result)) > + TP_ARGS(inode, vma, vmf, max_pgoff, result)) > > DEFINE_PMD_FAULT_EVENT(dax_pmd_fault); > DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done); > diff --git a/mm/memory.c b/mm/memory.c > index e37250f..8ec36cf 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -3447,8 +3447,7 @@ static int create_huge_pmd(struct vm_fault *vmf) > if (vma_is_anonymous(vma)) > return do_huge_pmd_anonymous_page(vmf); > if (vma->vm_ops->pmd_fault) > - return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, > - vmf->flags); > + return vma->vm_ops->pmd_fault(vma, vmf); > return VM_FAULT_FALLBACK; > } > > @@ -3457,8 +3456,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) > if (vma_is_anonymous(vmf->vma)) > return do_huge_pmd_wp_page(vmf, orig_pmd); > if (vmf->vma->vm_ops->pmd_fault) > - return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, > - vmf->pmd, vmf->flags); > + return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf); > > /* COW handled on pte level: split pmd */ > VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); >
On 12/13/2016 05:15 AM, Jan Kara wrote: > On Thu 08-12-16 14:34:21, Dave Jiang wrote: >> Instead of passing in multiple parameters in the pmd_fault() handler, >> a vmf can be passed in just like a fault() handler. This will simplify >> code and remove the need for the actual pmd fault handlers to allocate a >> vmf. Related functions are also modified to do the same. >> >> Signed-off-by: Dave Jiang <dave.jiang@intel.com> >> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com> > > I like the idea however see below: > >> @@ -1377,21 +1376,20 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, >> if (iomap.offset + iomap.length < pos + PMD_SIZE) >> goto unlock_entry; >> >> - vmf.pgoff = pgoff; >> - vmf.flags = flags; >> - vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; >> + vmf->pgoff = pgoff; >> + vmf->gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; > > But now it's really unexpected that you change pgoff and gfp_mask because > that will propagate back to the caller and if we return VM_FAULT_FALLBACK > we may fault in wrong PTE because of this. So dax_iomap_pmd_fault() should > not modify the passed gfp_mask, just make its callers clear __GFP_FS from > it because *they* are responsible for acquiring locks / transactions that > block __GFP_FS allocations. They are also responsible for restoring > original gfp_mask once dax_iomap_pmd_fault() returns. Ok will fix. > > dax_iomap_pmd_fault() needs to modify pgoff however it must restore it to > the original value before it returns. Need clarification here. Do you mean "If" dax_iomap_pmd_fault() needs to modify.... and right now it doesn't appear to need to modify pgoff so nothing needs to be done? Thanks. > > Otherwise the patch looks good to me. > > Honza > >> >> switch (iomap.type) { >> case IOMAP_MAPPED: >> - result = dax_pmd_insert_mapping(vma, pmd, &vmf, address, >> - &iomap, pos, write, &entry); >> + result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf, >> + vmf->address, &iomap, pos, write, &entry); >> break; >> case IOMAP_UNWRITTEN: >> case IOMAP_HOLE: >> if (WARN_ON_ONCE(write)) >> goto unlock_entry; >> - result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, >> - &entry); >> + result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address, >> + &iomap, &entry); >> break; >> default: >> WARN_ON_ONCE(1); >> @@ -1417,12 +1415,11 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, >> } >> fallback: >> if (result == VM_FAULT_FALLBACK) { >> - split_huge_pmd(vma, pmd, address); >> + split_huge_pmd(vma, vmf->pmd, vmf->address); >> count_vm_event(THP_FAULT_FALLBACK); >> } >> out: >> - trace_dax_pmd_fault_done(inode, vma, address, flags, pgoff, max_pgoff, >> - result); >> + trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result); >> return result; >> } >> EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); >> diff --git a/fs/ext4/file.c b/fs/ext4/file.c >> index d663d3d..10b64ba 100644 >> --- a/fs/ext4/file.c >> +++ b/fs/ext4/file.c >> @@ -275,21 +275,20 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) >> return result; >> } >> >> -static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, >> - pmd_t *pmd, unsigned int flags) >> +static int >> +ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) >> { >> int result; >> struct inode *inode = file_inode(vma->vm_file); >> struct super_block *sb = inode->i_sb; >> - bool write = flags & FAULT_FLAG_WRITE; >> + bool write = vmf->flags & FAULT_FLAG_WRITE; >> >> if (write) { >> sb_start_pagefault(sb); >> file_update_time(vma->vm_file); >> } >> down_read(&EXT4_I(inode)->i_mmap_sem); >> - result = dax_iomap_pmd_fault(vma, addr, pmd, flags, >> - &ext4_iomap_ops); >> + result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops); >> up_read(&EXT4_I(inode)->i_mmap_sem); >> if (write) >> sb_end_pagefault(sb); >> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c >> index d818c16..df0009f 100644 >> --- a/fs/xfs/xfs_file.c >> +++ b/fs/xfs/xfs_file.c >> @@ -1526,9 +1526,7 @@ xfs_filemap_fault( >> STATIC int >> xfs_filemap_pmd_fault( >> struct vm_area_struct *vma, >> - unsigned long addr, >> - pmd_t *pmd, >> - unsigned int flags) >> + struct vm_fault *vmf) >> { >> struct inode *inode = file_inode(vma->vm_file); >> struct xfs_inode *ip = XFS_I(inode); >> @@ -1539,16 +1537,16 @@ xfs_filemap_pmd_fault( >> >> trace_xfs_filemap_pmd_fault(ip); >> >> - if (flags & FAULT_FLAG_WRITE) { >> + if (vmf->flags & FAULT_FLAG_WRITE) { >> sb_start_pagefault(inode->i_sb); >> file_update_time(vma->vm_file); >> } >> >> xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); >> - ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops); >> + ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops); >> xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); >> >> - if (flags & FAULT_FLAG_WRITE) >> + if (vmf->flags & FAULT_FLAG_WRITE) >> sb_end_pagefault(inode->i_sb); >> >> return ret; >> diff --git a/include/linux/dax.h b/include/linux/dax.h >> index 6e36b11..9761c90 100644 >> --- a/include/linux/dax.h >> +++ b/include/linux/dax.h >> @@ -71,16 +71,15 @@ static inline unsigned int dax_radix_order(void *entry) >> return PMD_SHIFT - PAGE_SHIFT; >> return 0; >> } >> -int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, >> - pmd_t *pmd, unsigned int flags, struct iomap_ops *ops); >> +int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, >> + struct iomap_ops *ops); >> #else >> static inline unsigned int dax_radix_order(void *entry) >> { >> return 0; >> } >> static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma, >> - unsigned long address, pmd_t *pmd, unsigned int flags, >> - struct iomap_ops *ops) >> + struct vm_fault *vmf, struct iomap_ops *ops) >> { >> return VM_FAULT_FALLBACK; >> } >> diff --git a/include/linux/mm.h b/include/linux/mm.h >> index 30f416a..aef645b 100644 >> --- a/include/linux/mm.h >> +++ b/include/linux/mm.h >> @@ -347,8 +347,7 @@ struct vm_operations_struct { >> void (*close)(struct vm_area_struct * area); >> int (*mremap)(struct vm_area_struct * area); >> int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); >> - int (*pmd_fault)(struct vm_area_struct *, unsigned long address, >> - pmd_t *, unsigned int flags); >> + int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf); >> void (*map_pages)(struct vm_fault *vmf, >> pgoff_t start_pgoff, pgoff_t end_pgoff); >> >> diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h >> index c3b0aae..a98665b 100644 >> --- a/include/trace/events/fs_dax.h >> +++ b/include/trace/events/fs_dax.h >> @@ -8,9 +8,8 @@ >> >> DECLARE_EVENT_CLASS(dax_pmd_fault_class, >> TP_PROTO(struct inode *inode, struct vm_area_struct *vma, >> - unsigned long address, unsigned int flags, pgoff_t pgoff, >> - pgoff_t max_pgoff, int result), >> - TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result), >> + struct vm_fault *vmf, pgoff_t max_pgoff, int result), >> + TP_ARGS(inode, vma, vmf, max_pgoff, result), >> TP_STRUCT__entry( >> __field(unsigned long, ino) >> __field(unsigned long, vm_start) >> @@ -29,9 +28,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, >> __entry->vm_start = vma->vm_start; >> __entry->vm_end = vma->vm_end; >> __entry->vm_flags = vma->vm_flags; >> - __entry->address = address; >> - __entry->flags = flags; >> - __entry->pgoff = pgoff; >> + __entry->address = vmf->address; >> + __entry->flags = vmf->flags; >> + __entry->pgoff = vmf->pgoff; >> __entry->max_pgoff = max_pgoff; >> __entry->result = result; >> ), >> @@ -54,9 +53,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, >> #define DEFINE_PMD_FAULT_EVENT(name) \ >> DEFINE_EVENT(dax_pmd_fault_class, name, \ >> TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ >> - unsigned long address, unsigned int flags, pgoff_t pgoff, \ >> + struct vm_fault *vmf, \ >> pgoff_t max_pgoff, int result), \ >> - TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result)) >> + TP_ARGS(inode, vma, vmf, max_pgoff, result)) >> >> DEFINE_PMD_FAULT_EVENT(dax_pmd_fault); >> DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done); >> diff --git a/mm/memory.c b/mm/memory.c >> index e37250f..8ec36cf 100644 >> --- a/mm/memory.c >> +++ b/mm/memory.c >> @@ -3447,8 +3447,7 @@ static int create_huge_pmd(struct vm_fault *vmf) >> if (vma_is_anonymous(vma)) >> return do_huge_pmd_anonymous_page(vmf); >> if (vma->vm_ops->pmd_fault) >> - return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, >> - vmf->flags); >> + return vma->vm_ops->pmd_fault(vma, vmf); >> return VM_FAULT_FALLBACK; >> } >> >> @@ -3457,8 +3456,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) >> if (vma_is_anonymous(vmf->vma)) >> return do_huge_pmd_wp_page(vmf, orig_pmd); >> if (vmf->vma->vm_ops->pmd_fault) >> - return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, >> - vmf->pmd, vmf->flags); >> + return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf); >> >> /* COW handled on pte level: split pmd */ >> VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); >>
On Tue 13-12-16 11:29:54, Dave Jiang wrote: > > > On 12/13/2016 05:15 AM, Jan Kara wrote: > > On Thu 08-12-16 14:34:21, Dave Jiang wrote: > >> Instead of passing in multiple parameters in the pmd_fault() handler, > >> a vmf can be passed in just like a fault() handler. This will simplify > >> code and remove the need for the actual pmd fault handlers to allocate a > >> vmf. Related functions are also modified to do the same. > >> > >> Signed-off-by: Dave Jiang <dave.jiang@intel.com> > >> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com> > > > > I like the idea however see below: > > > >> @@ -1377,21 +1376,20 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, > >> if (iomap.offset + iomap.length < pos + PMD_SIZE) > >> goto unlock_entry; > >> > >> - vmf.pgoff = pgoff; > >> - vmf.flags = flags; > >> - vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; > >> + vmf->pgoff = pgoff; > >> + vmf->gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; > > > > But now it's really unexpected that you change pgoff and gfp_mask because > > that will propagate back to the caller and if we return VM_FAULT_FALLBACK > > we may fault in wrong PTE because of this. So dax_iomap_pmd_fault() should > > not modify the passed gfp_mask, just make its callers clear __GFP_FS from > > it because *they* are responsible for acquiring locks / transactions that > > block __GFP_FS allocations. They are also responsible for restoring > > original gfp_mask once dax_iomap_pmd_fault() returns. > > Ok will fix. > > > > > dax_iomap_pmd_fault() needs to modify pgoff however it must restore it to > > the original value before it returns. > > Need clarification here. Do you mean "If" dax_iomap_pmd_fault() needs to > modify.... and right now it doesn't appear to need to modify pgoff so > nothing needs to be done? Thanks. How come? I can see: pgoff = linear_page_index(vma, pmd_addr); a few lines above - we need to modify pgoff to contain huge page aligned file index instead of only page aligned... Honza
On 12/14/2016 02:57 AM, Jan Kara wrote: > On Tue 13-12-16 11:29:54, Dave Jiang wrote: >> >> >> On 12/13/2016 05:15 AM, Jan Kara wrote: >>> On Thu 08-12-16 14:34:21, Dave Jiang wrote: >>>> Instead of passing in multiple parameters in the pmd_fault() handler, >>>> a vmf can be passed in just like a fault() handler. This will simplify >>>> code and remove the need for the actual pmd fault handlers to allocate a >>>> vmf. Related functions are also modified to do the same. >>>> >>>> Signed-off-by: Dave Jiang <dave.jiang@intel.com> >>>> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com> >>> >>> I like the idea however see below: >>> >>>> @@ -1377,21 +1376,20 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, >>>> if (iomap.offset + iomap.length < pos + PMD_SIZE) >>>> goto unlock_entry; >>>> >>>> - vmf.pgoff = pgoff; >>>> - vmf.flags = flags; >>>> - vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; >>>> + vmf->pgoff = pgoff; >>>> + vmf->gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; >>> >>> But now it's really unexpected that you change pgoff and gfp_mask because >>> that will propagate back to the caller and if we return VM_FAULT_FALLBACK >>> we may fault in wrong PTE because of this. So dax_iomap_pmd_fault() should >>> not modify the passed gfp_mask, just make its callers clear __GFP_FS from >>> it because *they* are responsible for acquiring locks / transactions that >>> block __GFP_FS allocations. They are also responsible for restoring >>> original gfp_mask once dax_iomap_pmd_fault() returns. >> >> Ok will fix. >> >>> >>> dax_iomap_pmd_fault() needs to modify pgoff however it must restore it to >>> the original value before it returns. >> >> Need clarification here. Do you mean "If" dax_iomap_pmd_fault() needs to >> modify.... and right now it doesn't appear to need to modify pgoff so >> nothing needs to be done? Thanks. > > How come? I can see: > > pgoff = linear_page_index(vma, pmd_addr); > > a few lines above - we need to modify pgoff to contain huge page aligned > file index instead of only page aligned... > > Honza > Yep. My mistake. I misunderstood. Will fix.
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index c753a4c..947e49a 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -379,10 +379,9 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, - struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, - unsigned int flags) + struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long pmd_addr = addr & PMD_MASK; + unsigned long pmd_addr = vmf->address & PMD_MASK; struct device *dev = &dax_dev->dev; struct dax_region *dax_region; phys_addr_t phys; @@ -414,23 +413,22 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - return vmf_insert_pfn_pmd(vma, addr, pmd, pfn, - flags & FAULT_FLAG_WRITE); + return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, + vmf->flags & FAULT_FLAG_WRITE); } -static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) +static int dax_dev_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int rc; struct file *filp = vma->vm_file; struct dax_dev *dax_dev = filp->private_data; dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, - current->comm, (flags & FAULT_FLAG_WRITE) + current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", vma->vm_start, vma->vm_end); rcu_read_lock(); - rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags); + rc = __dax_dev_pmd_fault(dax_dev, vma, vmf); rcu_read_unlock(); return rc; diff --git a/fs/dax.c b/fs/dax.c index d3fe880..21ebe5b 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1310,18 +1310,17 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, return VM_FAULT_FALLBACK; } -int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, struct iomap_ops *ops) +int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops) { struct address_space *mapping = vma->vm_file->f_mapping; - unsigned long pmd_addr = address & PMD_MASK; - bool write = flags & FAULT_FLAG_WRITE; + unsigned long pmd_addr = vmf->address & PMD_MASK; + bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; int result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; pgoff_t max_pgoff, pgoff; - struct vm_fault vmf; void *entry; loff_t pos; int error; @@ -1334,7 +1333,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, pgoff = linear_page_index(vma, pmd_addr); max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; - trace_dax_pmd_fault(inode, vma, address, flags, pgoff, max_pgoff, 0); + trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0); /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) @@ -1377,21 +1376,20 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (iomap.offset + iomap.length < pos + PMD_SIZE) goto unlock_entry; - vmf.pgoff = pgoff; - vmf.flags = flags; - vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; + vmf->pgoff = pgoff; + vmf->gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vma, pmd, &vmf, address, - &iomap, pos, write, &entry); + result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf, + vmf->address, &iomap, pos, write, &entry); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) goto unlock_entry; - result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, - &entry); + result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address, + &iomap, &entry); break; default: WARN_ON_ONCE(1); @@ -1417,12 +1415,11 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, } fallback: if (result == VM_FAULT_FALLBACK) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, vmf->pmd, vmf->address); count_vm_event(THP_FAULT_FALLBACK); } out: - trace_dax_pmd_fault_done(inode, vma, address, flags, pgoff, max_pgoff, - result); + trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result); return result; } EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d663d3d..10b64ba 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -275,21 +275,20 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return result; } -static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) +static int +ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int result; struct inode *inode = file_inode(vma->vm_file); struct super_block *sb = inode->i_sb; - bool write = flags & FAULT_FLAG_WRITE; + bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); } down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_pmd_fault(vma, addr, pmd, flags, - &ext4_iomap_ops); + result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops); up_read(&EXT4_I(inode)->i_mmap_sem); if (write) sb_end_pagefault(sb); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d818c16..df0009f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1526,9 +1526,7 @@ xfs_filemap_fault( STATIC int xfs_filemap_pmd_fault( struct vm_area_struct *vma, - unsigned long addr, - pmd_t *pmd, - unsigned int flags) + struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); struct xfs_inode *ip = XFS_I(inode); @@ -1539,16 +1537,16 @@ xfs_filemap_pmd_fault( trace_xfs_filemap_pmd_fault(ip); - if (flags & FAULT_FLAG_WRITE) { + if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops); + ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (flags & FAULT_FLAG_WRITE) + if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(inode->i_sb); return ret; diff --git a/include/linux/dax.h b/include/linux/dax.h index 6e36b11..9761c90 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -71,16 +71,15 @@ static inline unsigned int dax_radix_order(void *entry) return PMD_SHIFT - PAGE_SHIFT; return 0; } -int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, struct iomap_ops *ops); +int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops); #else static inline unsigned int dax_radix_order(void *entry) { return 0; } static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, unsigned int flags, - struct iomap_ops *ops) + struct vm_fault *vmf, struct iomap_ops *ops) { return VM_FAULT_FALLBACK; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 30f416a..aef645b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -347,8 +347,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); - int (*pmd_fault)(struct vm_area_struct *, unsigned long address, - pmd_t *, unsigned int flags); + int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index c3b0aae..a98665b 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -8,9 +8,8 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, TP_PROTO(struct inode *inode, struct vm_area_struct *vma, - unsigned long address, unsigned int flags, pgoff_t pgoff, - pgoff_t max_pgoff, int result), - TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result), + struct vm_fault *vmf, pgoff_t max_pgoff, int result), + TP_ARGS(inode, vma, vmf, max_pgoff, result), TP_STRUCT__entry( __field(unsigned long, ino) __field(unsigned long, vm_start) @@ -29,9 +28,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, __entry->vm_start = vma->vm_start; __entry->vm_end = vma->vm_end; __entry->vm_flags = vma->vm_flags; - __entry->address = address; - __entry->flags = flags; - __entry->pgoff = pgoff; + __entry->address = vmf->address; + __entry->flags = vmf->flags; + __entry->pgoff = vmf->pgoff; __entry->max_pgoff = max_pgoff; __entry->result = result; ), @@ -54,9 +53,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, #define DEFINE_PMD_FAULT_EVENT(name) \ DEFINE_EVENT(dax_pmd_fault_class, name, \ TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ - unsigned long address, unsigned int flags, pgoff_t pgoff, \ + struct vm_fault *vmf, \ pgoff_t max_pgoff, int result), \ - TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result)) + TP_ARGS(inode, vma, vmf, max_pgoff, result)) DEFINE_PMD_FAULT_EVENT(dax_pmd_fault); DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done); diff --git a/mm/memory.c b/mm/memory.c index e37250f..8ec36cf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3447,8 +3447,7 @@ static int create_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, - vmf->flags); + return vma->vm_ops->pmd_fault(vma, vmf); return VM_FAULT_FALLBACK; } @@ -3457,8 +3456,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); if (vmf->vma->vm_ops->pmd_fault) - return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, - vmf->pmd, vmf->flags); + return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf); /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);