@@ -148,7 +148,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
/* If for any reason at all we couldn't handle the fault,
make sure we exit gracefully rather than endlessly redo
the fault. */
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if (fault_signal_pending(fault, regs))
return;
@@ -130,7 +130,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
goto bad_area;
}
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
/* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
@@ -224,7 +224,7 @@ __do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
goto out;
}
- return handle_mm_fault(vma, addr & PAGE_MASK, flags);
+ return handle_mm_fault(vma, addr & PAGE_MASK, flags, NULL);
check_stack:
/* Don't allow expansion below FIRST_USER_ADDRESS */
@@ -428,7 +428,7 @@ static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
*/
if (!(vma->vm_flags & vm_flags))
return VM_FAULT_BADACCESS;
- return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags);
+ return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, NULL);
}
static bool is_el0_instruction_abort(unsigned int esr)
@@ -150,7 +150,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, write ? FAULT_FLAG_WRITE : 0);
+ fault = handle_mm_fault(vma, address, write ? FAULT_FLAG_WRITE : 0,
+ NULL);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
@@ -88,7 +88,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
break;
}
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if (fault_signal_pending(fault, regs))
return;
@@ -143,7 +143,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
* sure we exit gracefully rather than endlessly redo the
* fault.
*/
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if (fault_signal_pending(fault, regs))
return;
@@ -134,7 +134,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
pr_debug("handle_mm_fault returns %x\n", fault);
if (fault_signal_pending(fault, regs))
@@ -214,7 +214,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if (fault_signal_pending(fault, regs))
return;
@@ -152,7 +152,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if (fault_signal_pending(fault, regs))
return;
@@ -206,7 +206,7 @@ void do_page_fault(unsigned long entry, unsigned long addr,
* the fault.
*/
- fault = handle_mm_fault(vma, addr, flags);
+ fault = handle_mm_fault(vma, addr, flags, NULL);
/*
* If we need to retry but a fatal signal is pending, handle the
@@ -131,7 +131,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if (fault_signal_pending(fault, regs))
return;
@@ -159,7 +159,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if (fault_signal_pending(fault, regs))
return;
@@ -302,7 +302,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
* fault.
*/
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if (fault_signal_pending(fault, regs))
return;
@@ -64,7 +64,7 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
}
ret = 0;
- *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0);
+ *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL);
if (unlikely(*flt & VM_FAULT_ERROR)) {
if (*flt & VM_FAULT_OOM) {
ret = -ENOMEM;
@@ -607,7 +607,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
major |= fault & VM_FAULT_MAJOR;
@@ -109,7 +109,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs)
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, addr, flags);
+ fault = handle_mm_fault(vma, addr, flags, NULL);
/*
* If we need to retry but a fatal signal is pending, handle the
@@ -478,7 +478,7 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if (fault_signal_pending(fault, regs)) {
fault = VM_FAULT_SIGNAL;
if (flags & FAULT_FLAG_RETRY_NOWAIT)
@@ -482,7 +482,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if (unlikely(fault & (VM_FAULT_RETRY | VM_FAULT_ERROR)))
if (mm_fault_error(regs, error_code, address, fault))
@@ -234,7 +234,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if (fault_signal_pending(fault, regs))
return;
@@ -410,7 +410,7 @@ static void force_user_fault(unsigned long address, int write)
if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
goto bad_area;
}
- switch (handle_mm_fault(vma, address, flags)) {
+ switch (handle_mm_fault(vma, address, flags, NULL)) {
case VM_FAULT_SIGBUS:
case VM_FAULT_OOM:
goto do_sigbus;
@@ -422,7 +422,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
goto bad_area;
}
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if (fault_signal_pending(fault, regs))
goto exit_exception;
@@ -71,7 +71,7 @@ int handle_page_fault(unsigned long address, unsigned long ip,
do {
vm_fault_t fault;
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
goto out_nosemaphore;
@@ -1291,7 +1291,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* userland). The return to userland is identified whenever
* FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
*/
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
major |= fault & VM_FAULT_MAJOR;
/* Quick path to respond to signals */
@@ -107,7 +107,7 @@ void do_page_fault(struct pt_regs *regs)
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags, NULL);
if (fault_signal_pending(fault, regs))
return;
@@ -495,7 +495,7 @@ static void do_fault(struct work_struct *work)
if (access_error(vma, fault))
goto out;
- ret = handle_mm_fault(vma, address, flags);
+ ret = handle_mm_fault(vma, address, flags, NULL);
out:
mmap_read_unlock(mm);
@@ -872,7 +872,8 @@ static irqreturn_t prq_event_thread(int irq, void *d)
goto invalid;
ret = handle_mm_fault(vma, address,
- req->wr_req ? FAULT_FLAG_WRITE : 0);
+ req->wr_req ? FAULT_FLAG_WRITE : 0,
+ NULL);
if (ret & VM_FAULT_ERROR)
goto invalid;
@@ -39,6 +39,7 @@ struct file_ra_state;
struct user_struct;
struct writeback_control;
struct bdi_writeback;
+struct pt_regs;
void init_mm_internals(void);
@@ -1659,7 +1660,8 @@ int invalidate_inode_page(struct page *page);
#ifdef CONFIG_MMU
extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags);
+ unsigned long address, unsigned int flags,
+ struct pt_regs *regs);
extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
@@ -1669,7 +1671,8 @@ void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
#else
static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags)
+ unsigned long address, unsigned int flags,
+ struct pt_regs *regs)
{
/* should never happen if there's no MMU */
BUG();
@@ -884,7 +884,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
fault_flags |= FAULT_FLAG_TRIED;
}
- ret = handle_mm_fault(vma, address, fault_flags);
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, *flags);
@@ -1238,7 +1238,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
fatal_signal_pending(current))
return -EINTR;
- ret = handle_mm_fault(vma, address, fault_flags);
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
major |= ret & VM_FAULT_MAJOR;
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, 0);
@@ -75,7 +75,8 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
}
for (; addr < end; addr += PAGE_SIZE)
- if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR)
+ if (handle_mm_fault(vma, addr, fault_flags, NULL) &
+ VM_FAULT_ERROR)
return -EFAULT;
return -EBUSY;
}
@@ -480,7 +480,8 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
break;
if (PageKsm(page))
ret = handle_mm_fault(vma, addr,
- FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
+ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+ NULL);
else
ret = VM_FAULT_WRITE;
put_page(page);
@@ -71,6 +71,8 @@
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
#include <trace/events/kmem.h>
@@ -4360,6 +4362,64 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return handle_pte_fault(&vmf);
}
+/**
+ * mm_account_fault - Do page fault accountings
+ *
+ * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
+ * of perf event counters, but we'll still do the per-task accounting to
+ * the task who triggered this page fault.
+ * @address: the faulted address.
+ * @flags: the fault flags.
+ * @ret: the fault retcode.
+ *
+ * This will take care of most of the page fault accountings. Meanwhile, it
+ * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
+ * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * still be in per-arch page fault handlers at the entry of page fault.
+ */
+static inline void mm_account_fault(struct pt_regs *regs,
+ unsigned long address, unsigned int flags,
+ vm_fault_t ret)
+{
+ bool major;
+
+ /*
+ * We don't do accounting for some specific faults:
+ *
+ * - Unsuccessful faults (e.g. when the address wasn't valid). That
+ * includes arch_vma_access_permitted() failing before reaching here.
+ * So this is not a "this many hardware page faults" counter. We
+ * should use the hw profiling for that.
+ *
+ * - Incomplete faults (VM_FAULT_RETRY). They will only be counted
+ * once they're completed.
+ */
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
+ return;
+
+ /*
+ * We define the fault as a major fault when the final successful fault
+ * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
+ * handle it immediately previously).
+ */
+ major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
+
+ /*
+ * If the fault is done for GUP, regs will be NULL, and we will skip
+ * the fault accounting.
+ */
+ if (!regs)
+ return;
+
+ if (major) {
+ current->maj_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ } else {
+ current->min_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
+ }
+}
+
/*
* By the time we get here, we already hold the mm semaphore
*
@@ -4367,7 +4427,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
* return value. See filemap_fault() and __lock_page_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags)
+ unsigned int flags, struct pt_regs *regs)
{
vm_fault_t ret;
@@ -4408,6 +4468,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
mem_cgroup_oom_synchronize(false);
}
+ mm_account_fault(regs, address, flags, ret);
+
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
This is a preparation patch to move page fault accountings into the general code in handle_mm_fault(). This includes both the per task flt_maj/flt_min counters, and the major/minor page fault perf events. To do this, the pt_regs pointer is passed into handle_mm_fault(). PERF_COUNT_SW_PAGE_FAULTS should still be kept in per-arch page fault handlers. So far, all the pt_regs pointer that passed into handle_mm_fault() is NULL, which means this patch should have no intented functional change. Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Peter Xu <peterx@redhat.com> --- arch/alpha/mm/fault.c | 2 +- arch/arc/mm/fault.c | 2 +- arch/arm/mm/fault.c | 2 +- arch/arm64/mm/fault.c | 2 +- arch/csky/mm/fault.c | 3 +- arch/hexagon/mm/vm_fault.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/m68k/mm/fault.c | 2 +- arch/microblaze/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/nds32/mm/fault.c | 2 +- arch/nios2/mm/fault.c | 2 +- arch/openrisc/mm/fault.c | 2 +- arch/parisc/mm/fault.c | 2 +- arch/powerpc/mm/copro_fault.c | 2 +- arch/powerpc/mm/fault.c | 2 +- arch/riscv/mm/fault.c | 2 +- arch/s390/mm/fault.c | 2 +- arch/sh/mm/fault.c | 2 +- arch/sparc/mm/fault_32.c | 4 +-- arch/sparc/mm/fault_64.c | 2 +- arch/um/kernel/trap.c | 2 +- arch/x86/mm/fault.c | 2 +- arch/xtensa/mm/fault.c | 2 +- drivers/iommu/amd/iommu_v2.c | 2 +- drivers/iommu/intel/svm.c | 3 +- include/linux/mm.h | 7 ++-- mm/gup.c | 4 +-- mm/hmm.c | 3 +- mm/ksm.c | 3 +- mm/memory.c | 64 ++++++++++++++++++++++++++++++++++- 31 files changed, 103 insertions(+), 34 deletions(-)