[01/12] x86_64: memset_user()

Message ID 20181221181423.20455-2-igor.stoppa@huawei.com (mailing list archive)
State New, archived

Commit Message

Igor Stoppa Dec. 21, 2018, 6:14 p.m. UTC
Create x86_64 specific version of memset for user space, based on
clear_user().
This will be used for implementing wr_memset() in the __wr_after_init
scenario, where write-rare variables have an alternate mapping for
writing.

Signed-off-by: Igor Stoppa <igor.stoppa@huawei.com>

CC: Andy Lutomirski <luto@amacapital.net>
CC: Nadav Amit <nadav.amit@gmail.com>
CC: Matthew Wilcox <willy@infradead.org>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Kees Cook <keescook@chromium.org>
CC: Dave Hansen <dave.hansen@linux.intel.com>
CC: Mimi Zohar <zohar@linux.vnet.ibm.com>
CC: Thiago Jung Bauermann <bauerman@linux.ibm.com>
CC: Ahmed Soliman <ahmedsoliman@mena.vt.edu>
CC: linux-integrity@vger.kernel.org
CC: kernel-hardening@lists.openwall.com
CC: linux-mm@kvack.org
CC: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/uaccess_64.h |  6 ++++
 arch/x86/lib/usercopy_64.c        | 54 +++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)
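
For context, the rest of the series builds wr_memset() on top of this
primitive: the write-rare target is translated into its alternate,
writable mapping and the user-space memset is applied through that
mapping. A minimal sketch of such a caller, assuming a hypothetical
wr_translate() helper that is not part of this patch:

	/*
	 * Hedged sketch, not part of this patch: wr_translate() is an
	 * assumed helper that returns the alternate, writable alias of
	 * a __wr_after_init object.
	 */
	static inline bool wr_memset(void *dst, int c, size_t n)
	{
		void __user *alias = (void __user *)wr_translate(dst);

		/* memset_user() returns the number of bytes NOT set. */
		return memset_user(alias, c, n) == 0;
	}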

Comments

Matthew Wilcox Dec. 21, 2018, 6:25 p.m. UTC | #1
On Fri, Dec 21, 2018 at 08:14:12PM +0200, Igor Stoppa wrote:
> +unsigned long __memset_user(void __user *addr, int c, unsigned long size)
> +{
> +	long __d0;
> +	unsigned long  pattern = 0;
> +	int i;
> +
> +	for (i = 0; i < 8; i++)
> +		pattern = (pattern << 8) | (0xFF & c);

That's inefficient.

	pattern = (unsigned char)c;
	pattern |= pattern << 8;
	pattern |= pattern << 16;
	pattern |= pattern << 32;

Igor Stoppa Dec. 21, 2018, 6:46 p.m. UTC | #2
On 21/12/2018 20:25, Matthew Wilcox wrote:
> On Fri, Dec 21, 2018 at 08:14:12PM +0200, Igor Stoppa wrote:
>> +unsigned long __memset_user(void __user *addr, int c, unsigned long size)
>> +{
>> +	long __d0;
>> +	unsigned long  pattern = 0;
>> +	int i;
>> +
>> +	for (i = 0; i < 8; i++)
>> +		pattern = (pattern << 8) | (0xFF & c);
> 
> That's inefficient.
> 
> 	pattern = (unsigned char)c;
> 	pattern |= pattern << 8;
> 	pattern |= pattern << 16;
> 	pattern |= pattern << 32;

ok, thank you

--
igor

Cyrill Gorcunov Dec. 21, 2018, 8:05 p.m. UTC | #3
On Fri, Dec 21, 2018 at 10:25:16AM -0800, Matthew Wilcox wrote:
> On Fri, Dec 21, 2018 at 08:14:12PM +0200, Igor Stoppa wrote:
> > +unsigned long __memset_user(void __user *addr, int c, unsigned long size)
> > +{
> > +	long __d0;
> > +	unsigned long  pattern = 0;
> > +	int i;
> > +
> > +	for (i = 0; i < 8; i++)
> > +		pattern = (pattern << 8) | (0xFF & c);
> 
> That's inefficient.
> 
> 	pattern = (unsigned char)c;
> 	pattern |= pattern << 8;
> 	pattern |= pattern << 16;
> 	pattern |= pattern << 32;

Won't

	pattern = 0x0101010101010101 * c;

do the same but faster?

Matthew Wilcox Dec. 21, 2018, 8:29 p.m. UTC | #4
On Fri, Dec 21, 2018 at 11:05:46PM +0300, Cyrill Gorcunov wrote:
> On Fri, Dec 21, 2018 at 10:25:16AM -0800, Matthew Wilcox wrote:
> > On Fri, Dec 21, 2018 at 08:14:12PM +0200, Igor Stoppa wrote:
> > > +unsigned long __memset_user(void __user *addr, int c, unsigned long size)
> > > +{
> > > +	long __d0;
> > > +	unsigned long  pattern = 0;
> > > +	int i;
> > > +
> > > +	for (i = 0; i < 8; i++)
> > > +		pattern = (pattern << 8) | (0xFF & c);
> > 
> > That's inefficient.
> > 
> > 	pattern = (unsigned char)c;
> > 	pattern |= pattern << 8;
> > 	pattern |= pattern << 16;
> > 	pattern |= pattern << 32;
> 
> Won't
> 
> 	pattern = 0x0101010101010101 * c;
> 
> do the same but faster?

Depends on your CPU.  Some yes, some no.

(Also you need to cast 'c' to unsigned char to avoid someone passing in
0x1234 and getting 0x4646464646464634 instead of 0x3434343434343434)

Cyrill Gorcunov Dec. 21, 2018, 8:46 p.m. UTC | #5
On Fri, Dec 21, 2018 at 12:29:46PM -0800, Matthew Wilcox wrote:
> > > 
> > > That's inefficient.
> > > 
> > > 	pattern = (unsigned char)c;
> > > 	pattern |= pattern << 8;
> > > 	pattern |= pattern << 16;
> > > 	pattern |= pattern << 32;
> > 
> > Won't
> > 
> > 	pattern = 0x0101010101010101 * c;
> > 
> > do the same but faster?
> 
> Depends on your CPU.  Some yes, some no.
> 
> (Also you need to cast 'c' to unsigned char to avoid someone passing in
> 0x1234 and getting 0x4646464646464634 instead of 0x3434343434343434)

Cast to unsigned char is needed in any case. And as far as I remember
we've been using this multiplication trick for a really long time
in x86 land. I don't have the sources at hand right now, but it
should be somewhere in the assembly libs.

Matthew Wilcox Dec. 21, 2018, 9:07 p.m. UTC | #6
On Fri, Dec 21, 2018 at 11:46:16PM +0300, Cyrill Gorcunov wrote:
> Cast to unsigned char is needed in any case. And as far as I remember
> we've been using this multiplication trick for a really long time
> in x86 land. I don't have the sources at hand right now, but it
> should be somewhere in the assembly libs.

x86 isn't the only CPU.  Some CPUs have slow multiplies but fast shifts.
Also loading 0x0101010101010101 into a register may be inefficient on
some CPUs.

Cyrill Gorcunov Dec. 21, 2018, 9:17 p.m. UTC | #7
On Fri, Dec 21, 2018 at 01:07:21PM -0800, Matthew Wilcox wrote:
> On Fri, Dec 21, 2018 at 11:46:16PM +0300, Cyrill Gorcunov wrote:
> > Cast to unsigned char is needed in any case. And as far as I remember
> > we've been using this multiplication trick for a really long time
> > in x86 land. I don't have the sources at hand right now, but it
> > should be somewhere in the assembly libs.
> 
> x86 isn't the only CPU.  Some CPUs have slow multiplies but fast shifts.

This is an x86-64 patch, not generic code.

> Also loading 0x0101010101010101 into a register may be inefficient on
> some CPUs.

It is pretty efficient on x86-64. Moreover, the self-dependent
a |= a << b chain is a source of data hazards inside the CPU
pipeline. Anyway, I'm not going to insist, I just wanted to point
out the trick. Up to you what to choose.
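
To summarize, a hedged sketch of the two byte-broadcast variants
discussed above (the helper names are made up for illustration). Both
forms need the cast to unsigned char, since c == 0x1234 has to produce
0x3434343434343434 rather than 0x4646464646464634:

	/* Shift-and-or broadcast, per Matthew's suggestion. */
	static inline unsigned long memset_pattern_shift(int c)
	{
		unsigned long pattern = (unsigned char)c;

		pattern |= pattern << 8;
		pattern |= pattern << 16;
		pattern |= pattern << 32;
		return pattern;
	}

	/* Multiplication trick, per Cyrill's suggestion. */
	static inline unsigned long memset_pattern_mul(int c)
	{
		return (unsigned char)c * 0x0101010101010101UL;
	}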

Patch

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index a9d637bc301d..f194bfce4866 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -213,4 +213,10 @@ copy_user_handle_tail(char *to, char *from, unsigned len);
 unsigned long
 mcsafe_handle_tail(char *to, char *from, unsigned len);
 
+unsigned long __must_check
+memset_user(void __user *mem, int c, unsigned long len);
+
+unsigned long __must_check
+__memset_user(void __user *mem, int c, unsigned long len);
+
 #endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 1bd837cdc4b1..84f8f8a20b30 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -9,6 +9,60 @@ 
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
 
+/*
+ * Memset Userspace
+ */
+
+unsigned long __memset_user(void __user *addr, int c, unsigned long size)
+{
+	long __d0;
+	unsigned long  pattern = 0;
+	int i;
+
+	for (i = 0; i < 8; i++)
+		pattern = (pattern << 8) | (0xFF & c);
+	might_fault();
+	/* no memory constraint: gcc doesn't know about this memory */
+	stac();
+	asm volatile(
+		"       movq %[val], %%rdx\n"
+		"	testq  %[size8],%[size8]\n"
+		"	jz     4f\n"
+		"0:	mov %%rdx,(%[dst])\n"
+		"	addq   $8,%[dst]\n"
+		"	decl %%ecx ; jnz   0b\n"
+		"4:	movq  %[size1],%%rcx\n"
+		"	testl %%ecx,%%ecx\n"
+		"	jz     2f\n"
+		"1:	movb   %%dl,(%[dst])\n"
+		"	incq   %[dst]\n"
+		"	decl %%ecx ; jnz  1b\n"
+		"2:\n"
+		".section .fixup,\"ax\"\n"
+		"3:	lea 0(%[size1],%[size8],8),%[size8]\n"
+		"	jmp 2b\n"
+		".previous\n"
+		_ASM_EXTABLE_UA(0b, 3b)
+		_ASM_EXTABLE_UA(1b, 2b)
+		: [size8] "=&c"(size), [dst] "=&D" (__d0)
+		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
+		  [val] "ri"(pattern)
+		: "rdx");
+
+	clac();
+	return size;
+}
+EXPORT_SYMBOL(__memset_user);
+
+unsigned long memset_user(void __user *to, int c, unsigned long n)
+{
+	if (access_ok(VERIFY_WRITE, to, n))
+		return __memset_user(to, c, n);
+	return n;
+}
+EXPORT_SYMBOL(memset_user);
+
+
 /*
  * Zero Userspace
  */