diff mbox series

[RFC,v12,2/6] x86: mm: Skip faulting instruction for VM_DROPPABLE faults

Message ID 20221212185347.1286824-3-Jason@zx2c4.com (mailing list archive)
State New
Headers show
Series [RFC,v12,1/6] mm: add VM_DROPPABLE for designating always lazily freeable mappings | expand

Commit Message

Jason A. Donenfeld Dec. 12, 2022, 6:53 p.m. UTC
The prior commit introduced VM_DROPPABLE, but in a limited form where
the faulting instruction was retried instead of skipped. Finish that up
with the platform-specific aspect of skipping the actual instruction.

This works by copying userspace's %rip to a stack buffer of size
MAX_INSN_SIZE, decoding it, and then adding the length of the decoded
instruction to userspace's %rip. In the event any of these fail, just
fallback to not advancing %rip and trying again.

Cc: linux-mm@kvack.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/x86/mm/fault.c      | 19 +++++++++++++++++++
 include/linux/mm_types.h |  5 ++++-
 mm/memory.c              |  4 +++-
 3 files changed, 26 insertions(+), 2 deletions(-)

Comments

Jason A. Donenfeld Dec. 12, 2022, 9:01 p.m. UTC | #1
On Mon, Dec 12, 2022 at 11:53:43AM -0700, Jason A. Donenfeld wrote:
> +	if (fault & VM_FAULT_SKIP_INSN) {
> +		u8 insn_buf[MAX_INSN_SIZE];
> +		struct insn insn;
> +		size_t len;
> +
> +		len = sizeof(insn_buf) - copy_from_user(insn_buf, (void *)regs->ip, sizeof(insn_buf));
> +		if (!len)
> +			return;
> +
> +		if (insn_decode(&insn, insn_buf, len, in_32bit_syscall() ? INSN_MODE_32 : INSN_MODE_64) < 0)
> +			return;
> +
> +		regs->ip += insn.length;
> +		return;
> +	}

I just found umip.c, which does basically the same thing, but does it
correctly. For v+1, the above snippet will instead do this:

	if (fault & VM_FAULT_SKIP_INSN) {
		u8 buf[MAX_INSN_SIZE];
		struct insn insn;
		int nr_copied;

		nr_copied = insn_fetch_from_user(regs, buf);
		if (nr_copied <= 0)
			return;

		if (!insn_decode_from_regs(&insn, regs, buf, nr_copied))
			return;

		regs->ip += insn.length;
		return;
	}

Same thing, but those helpers do correct inspection of the environment
and registers.

Also, seeing this already being done in umip.c is heartening that the
approach here isn't overly insane.

Jason
diff mbox series

Patch

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7b0d4ab894c8..e5328073f8e0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -33,6 +33,8 @@ 
 #include <asm/kvm_para.h>		/* kvm_handle_async_pf		*/
 #include <asm/vdso.h>			/* fixup_vdso_exception()	*/
 #include <asm/irq_stack.h>
+#include <asm/insn.h>			/* insn_decode()		*/
+#include <asm/compat.h>			/* in_32bit_syscall()		*/
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
@@ -1454,6 +1456,23 @@  void do_user_addr_fault(struct pt_regs *regs,
 	}
 
 	mmap_read_unlock(mm);
+
+	if (fault & VM_FAULT_SKIP_INSN) {
+		u8 insn_buf[MAX_INSN_SIZE];
+		struct insn insn;
+		size_t len;
+
+		len = sizeof(insn_buf) - copy_from_user(insn_buf, (void *)regs->ip, sizeof(insn_buf));
+		if (!len)
+			return;
+
+		if (insn_decode(&insn, insn_buf, len, in_32bit_syscall() ? INSN_MODE_32 : INSN_MODE_64) < 0)
+			return;
+
+		regs->ip += insn.length;
+		return;
+	}
+
 	if (likely(!(fault & VM_FAULT_ERROR)))
 		return;
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 500e536796ca..4def1051499b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -861,6 +861,7 @@  typedef __bitwise unsigned int vm_fault_t;
  *				fsync() to complete (for synchronous page faults
  *				in DAX)
  * @VM_FAULT_COMPLETED:		->fault completed, meanwhile mmap lock released
+ * @VM_FAULT_SKIP_INSN:		->handle the fault by skipping faulting instruction
  * @VM_FAULT_HINDEX_MASK:	mask HINDEX value
  *
  */
@@ -879,6 +880,7 @@  enum vm_fault_reason {
 	VM_FAULT_DONE_COW       = (__force vm_fault_t)0x001000,
 	VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x002000,
 	VM_FAULT_COMPLETED      = (__force vm_fault_t)0x004000,
+	VM_FAULT_SKIP_INSN      = (__force vm_fault_t)0x008000,
 	VM_FAULT_HINDEX_MASK    = (__force vm_fault_t)0x0f0000,
 };
 
@@ -903,7 +905,8 @@  enum vm_fault_reason {
 	{ VM_FAULT_RETRY,               "RETRY" },	\
 	{ VM_FAULT_FALLBACK,            "FALLBACK" },	\
 	{ VM_FAULT_DONE_COW,            "DONE_COW" },	\
-	{ VM_FAULT_NEEDDSYNC,           "NEEDDSYNC" }
+	{ VM_FAULT_NEEDDSYNC,           "NEEDDSYNC" },	\
+	{ VM_FAULT_SKIP_INSN,		"SKIP_INSN" }
 
 struct vm_special_mapping {
 	const char *name;	/* The name, e.g. "[vdso]". */
diff --git a/mm/memory.c b/mm/memory.c
index 72403585e1a5..8834a7c1580f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5220,8 +5220,10 @@  vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	lru_gen_exit_fault();
 
 	/* If the mapping is droppable, then errors due to OOM aren't fatal. */
-	if (vma->vm_flags & VM_DROPPABLE)
+	if ((ret & VM_FAULT_OOM) && (vma->vm_flags & VM_DROPPABLE)) {
 		ret &= ~VM_FAULT_OOM;
+		ret |= VM_FAULT_SKIP_INSN;
+	}
 
 	if (flags & FAULT_FLAG_USER) {
 		mem_cgroup_exit_user_fault();