
[v2,1/9] s390/uaccess: Add storage key checked cmpxchg access to user space

Message ID 20221012205609.2811294-2-scgl@linux.ibm.com (mailing list archive)
State New
Series KVM: s390: Extend MEM_OP ioctl by storage key checked cmpxchg

Commit Message

Janis Schoetterl-Glausch Oct. 12, 2022, 8:56 p.m. UTC
Add cmpxchg functionality similar to that in cmpxchg.h except that the
target is a user space address and that the address' storage key is
matched with the access_key argument in order to honor key-controlled
protection.
The access is performed by changing to the secondary-spaces mode and
setting the PSW key for the duration of the compare and swap.

Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
---


Possible variations:
  * check the assumptions made in cmpxchg_user_key_size and error out
  * call functions called by copy_to_user
     * access_ok? is a nop
     * should_fail_usercopy?
     * instrument_copy_to_user? doesn't make sense IMO
  * don't be overly strict in cmpxchg_user_key


 arch/s390/include/asm/uaccess.h | 189 ++++++++++++++++++++++++++++++++
 1 file changed, 189 insertions(+)

Comments

Heiko Carstens Oct. 20, 2022, 11:18 a.m. UTC | #1
Hi Janis,

On Wed, Oct 12, 2022 at 10:56:01PM +0200, Janis Schoetterl-Glausch wrote:
> Add cmpxchg functionality similar to that in cmpxchg.h except that the
> target is a user space address and that the address' storage key is
> matched with the access_key argument in order to honor key-controlled
> protection.
> The access is performed by changing to the secondary-spaces mode and
> setting the PSW key for the duration of the compare and swap.
> 
> Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
> ---
> 
> 
> Possible variations:
>   * check the assumptions made in cmpxchg_user_key_size and error out
>   * call functions called by copy_to_user
>      * access_ok? is a nop
>      * should_fail_usercopy?
>      * instrument_copy_to_user? doesn't make sense IMO
>   * don't be overly strict in cmpxchg_user_key
> 
> 
>  arch/s390/include/asm/uaccess.h | 189 ++++++++++++++++++++++++++++++++
>  1 file changed, 189 insertions(+)

This is not how I would have expected something that would be
symmetrical/consistent with what we already have. However, instead of
spending several iterations I'll send something for this.

This might take a bit due to my limited time. So please be patient.
Nico Boehr Oct. 20, 2022, 1:40 p.m. UTC | #2
Quoting Janis Schoetterl-Glausch (2022-10-12 22:56:01)
[...]
> diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
> index f7038b800cc3..f148f5a22c93 100644
[...]
> +static __always_inline int __cmpxchg_user_key_small(int size, u64 address,
> +                                                   unsigned __int128 *old_p,
> +                                                   unsigned __int128 new, u8 access_key)
> +{

This function is quite hard for me to understand without some context. I have a
few suggestions for comments and one small question below.

> +       u32 shift, mask, old_word, new_word, align_mask, tmp;
> +       u64 aligned;
> +       int ret = -EFAULT;
> +

something like this:

/*
 * There is no instruction for a 1- or 2-byte compare-and-swap, hence emulate it
 * with a 4-byte compare-and-swap.
 * When the 4-byte compare-and-swap fails, it can be because the actual value
 * user space wanted to exchange mismatched. In this case, return to user space.
 * Or it can be because something outside of the value user space wanted to
 * access mismatched (the "remainder" of the word). In this case, retry in
 * kernel space.
 */

> +       switch (size) {
> +       case 2:

/* assume address is 2-byte-aligned - cannot cross word boundary */

> +               align_mask = 2;

/* fancy way of saying aligned = address & ~align_mask */

> +               aligned = (address ^ (address & align_mask));

/* generate mask to extract value to xchg from the word */

> +               shift = (sizeof(u32) - (address & align_mask) - size) * 8;
> +               mask = 0xffff << shift;
> +               old_word = ((u16)*old_p) << shift;
> +               new_word = ((u16)new) << shift;
> +               break;
> +       case 1:
> +               align_mask = 3;
> +               aligned = (address ^ (address & align_mask));
> +               shift = (sizeof(u32) - (address & align_mask) - size) * 8;
> +               mask = 0xff << shift;
> +               old_word = ((u8)*old_p) << shift;
> +               new_word = ((u8)new) << shift;
> +               break;
> +       }
> +       tmp = old_word; /* don't modify *old_p on fault */
> +       asm volatile(
> +                      "spka    0(%[access_key])\n"

/* secondary space has user asce loaded */

> +               "       sacf    256\n"
> +               "0:     l       %[tmp],%[aligned]\n"
> +               "1:     nr      %[tmp],%[mask]\n"

/* invert mask to generate mask for the remainder */

> +               "       xilf    %[mask],0xffffffff\n"
> +               "       or      %[new_word],%[tmp]\n"
> +               "       or      %[tmp],%[old_word]\n"
> +               "2:     lr      %[old_word],%[tmp]\n"
> +               "3:     cs      %[tmp],%[new_word],%[aligned]\n"
> +               "4:     jnl     5f\n"
> +               /* We'll restore old_word before the cs, use reg for the diff */
> +               "       xr      %[old_word],%[tmp]\n"
> +               /* Apply diff assuming only bits outside target byte(s) changed */
> +               "       xr      %[new_word],%[old_word]\n"
> +               /* If prior assumption false we exit loop, so not an issue */
> +               "       nr      %[old_word],%[mask]\n"
> +               "       jz      2b\n"

So if the remainder changed but the actual value to exchange stays the same, we
loop in the kernel. Does it maybe make sense to limit the number of iterations
we spend retrying? I think while looping here the calling process can't be
killed, can it?
Heiko Carstens Oct. 21, 2022, 7:22 p.m. UTC | #3
On Thu, Oct 20, 2022 at 03:40:56PM +0200, Nico Boehr wrote:
> Quoting Janis Schoetterl-Glausch (2022-10-12 22:56:01)
> > +               "2:     lr      %[old_word],%[tmp]\n"
> > +               "3:     cs      %[tmp],%[new_word],%[aligned]\n"
> > +               "4:     jnl     5f\n"
> > +               /* We'll restore old_word before the cs, use reg for the diff */
> > +               "       xr      %[old_word],%[tmp]\n"
> > +               /* Apply diff assuming only bits outside target byte(s) changed */
> > +               "       xr      %[new_word],%[old_word]\n"
> > +               /* If prior assumption false we exit loop, so not an issue */
> > +               "       nr      %[old_word],%[mask]\n"
> > +               "       jz      2b\n"
> 
> So if the remainder changed but the actual value to exchange stays the same, we
> loop in the kernel. Does it maybe make sense to limit the number of iterations
> we spend retrying? I think while looping here the calling process can't be
> killed, can it?

Yes, the number of loops should be limited; quite similar to what arm64
implemented with commit 03110a5cb216 ("arm64: futex: Bound number of
LDXR/STXR loops in FUTEX_WAKE_OP").
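
A bounded variant could look roughly like the sketch below. This is an
illustration under stated assumptions, not code from this series: the retry
limit and the -EAGAIN return are modeled on the arm64 futex change, and it
reuses the plain-C emulation from the earlier sketch (same big-endian caveat)
rather than the kernel's inline assembly.

#include <errno.h>
#include <stdint.h>

#define BYTE_CAS_MAX_RETRIES	128	/* illustrative bound, not from the series */

/* 0: exchanged, 1: target byte mismatched (*old updated), -EAGAIN: bound hit */
static int cmpxchg_u8_via_u32_bounded(uint8_t *addr, uint8_t *old, uint8_t new)
{
	uint32_t *aligned = (uint32_t *)((uintptr_t)addr & ~(uintptr_t)3);
	unsigned int shift = (sizeof(uint32_t) - ((uintptr_t)addr & 3) - 1) * 8;
	uint32_t mask = 0xffu << shift;
	uint32_t cur = __atomic_load_n(aligned, __ATOMIC_SEQ_CST);

	for (int i = 0; i < BYTE_CAS_MAX_RETRIES; i++) {
		uint32_t old_word = (cur & ~mask) | ((uint32_t)*old << shift);
		uint32_t new_word = (cur & ~mask) | ((uint32_t)new << shift);

		if (__atomic_compare_exchange_n(aligned, &old_word, new_word, 0,
						__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
			return 0;
		if (((old_word & mask) >> shift) != *old) {
			*old = (old_word & mask) >> shift;
			return 1;
		}
		cur = old_word;		/* only the remainder changed: retry */
	}
	return -EAGAIN;			/* give up; the caller decides what to do */
}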
Heiko Carstens Nov. 2, 2022, 2:12 p.m. UTC | #4
Hi Janis,

On Wed, Oct 12, 2022 at 10:56:01PM +0200, Janis Schoetterl-Glausch wrote:
> Add cmpxchg functionality similar to that in cmpxchg.h except that the
> target is a user space address and that the address' storage key is
> matched with the access_key argument in order to honor key-controlled
> protection.
> The access is performed by changing to the secondary-spaces mode and
> setting the PSW key for the duration of the compare and swap.
> 
> Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
> ---
> 
> 
> Possible variations:
>   * check the assumptions made in cmpxchg_user_key_size and error out
>   * call functions called by copy_to_user
>      * access_ok? is a nop
>      * should_fail_usercopy?
>      * instrument_copy_to_user? doesn't make sense IMO
>   * don't be overly strict in cmpxchg_user_key
> 
> 
>  arch/s390/include/asm/uaccess.h | 189 ++++++++++++++++++++++++++++++++
>  1 file changed, 189 insertions(+)

So finally I send the uaccess/cmpxchg patches in reply to this mail.
Sorry for the long delay!

The first three patches are not required for the functionality you need,
but given that I always stress that the code should be consistent I include
them anyway.

The changes are probably quite obvious:

- Keep uaccess cmpxchg code more or less identical to regular cmpxchg
  code. I wasn't able to come up with a readable code base which could be
  used for both variants.

- Users may only use the cmpxchg_user_key() macro - _not_ the inline
  function, which is an internal API. This will require adding a switch
  statement and a couple of casts within the KVM code, but shouldn't have
  much of an impact on the generated code.

- Cause link error for non-integral sizes, similar to other uaccess
  functions.

- cmpxchg_user_key() now has a simple return value: 0 or -EFAULT, and
  writes the old value to a location provided by a pointer. This is quite
  similar to the futex code. Users must compare the old and expected values
  to figure out if something was exchanged. Note that this is in most cases
  more efficient than extracting the condition code from the PSW with ipm,
  since nowadays we have instructions like compare and branch relative on
  condition, etc.

- Couple of other minor changes which I forgot.

Code is untested (of course :) ). Please give it a try and let me know if
this is good enough for your purposes.

I also did not limit the number of retries for the one and two byte
scenario. Before doing that we need to have proof that there really is a
problem. Maybe Nico or you will give this a try.

Thanks,
Heiko
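
To illustrate the calling convention described above, here is a hypothetical
caller sketch (not from this series; the function and parameter names are
placeholders, and it assumes the 0/-EFAULT return plus old-value write-back
that Heiko describes):

/* Hypothetical helper: all names are placeholders, not taken from the series. */
static int try_cmpxchg_u32_user(u32 __user *uaddr, u32 *expected, u32 new,
				u8 access_key, bool *exchanged)
{
	u32 old = *expected;
	int ret;

	ret = cmpxchg_user_key(uaddr, &old, new, access_key);
	if (ret)
		return ret;			/* -EFAULT: fault or key mismatch */
	*exchanged = (old == *expected);	/* compare old and expected values */
	*expected = old;			/* hand back the value actually found */
	return 0;
}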
Janis Schoetterl-Glausch Nov. 16, 2022, 7:36 p.m. UTC | #5
On Wed, 2022-11-02 at 15:12 +0100, Heiko Carstens wrote:
> 
[...]

> I also did not limit the number of retries for the one and two byte
> scenario. Before doing that we need to have proof that there really is a
> problem. Maybe Nico or you will give this a try.

I wrote a memop selftest testcase where the main thread uses the one byte cmpxchg
while n vcpus flip adjacent bits. The time the test case runs increases superlinearly with n.
With 248 vcpus, 1000 one byte cmpxchgs take 25s.
I'm not sure how meaningful the test is since the worst case would be if the threads hammering
the word ran on a cpu dedicated to them.

In any case, why not err on the side of caution and limit the iterations?
I'll send an rfc patch.
> 
> Thanks,
> Heiko
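
For reference, the contention pattern can be reproduced without KVM at all.
The following is a plain pthreads illustration (not the memop selftest): one
thread performs 1000 emulated one-byte compare-and-swaps on the low byte of a
word while N threads keep flipping bits in the other bytes, so every retry is
caused purely by "remainder" changes.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t word;		/* low byte is the cmpxchg target */
static volatile int stop;

static void *flipper(void *arg)
{
	uint32_t bit = 1u << (8 + (uintptr_t)arg % 24);	/* a bit outside the low byte */

	while (!stop)
		__atomic_fetch_xor(&word, bit, __ATOMIC_SEQ_CST);
	return NULL;
}

/* emulate a one-byte CAS of the low byte via a full-word CAS, count retries */
static unsigned long byte_cas(uint8_t expected, uint8_t new)
{
	unsigned long retries = 0;
	uint32_t cur = __atomic_load_n(&word, __ATOMIC_SEQ_CST);

	for (;;) {
		uint32_t old_word = (cur & ~0xffu) | expected;
		uint32_t new_word = (cur & ~0xffu) | new;

		if (__atomic_compare_exchange_n(&word, &old_word, new_word, 0,
						__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
			return retries;
		if ((old_word & 0xff) != expected)
			return retries;		/* real value mismatch */
		cur = old_word;			/* only the remainder changed */
		retries++;
	}
}

int main(int argc, char **argv)
{
	int n = argc > 1 ? atoi(argv[1]) : 4;
	pthread_t tid[n];
	unsigned long total = 0;

	for (long i = 0; i < n; i++)
		pthread_create(&tid[i], NULL, flipper, (void *)i);
	for (int i = 0; i < 1000; i++)
		total += byte_cas(0, 0);
	stop = 1;
	for (int i = 0; i < n; i++)
		pthread_join(tid[i], NULL);
	printf("%d flippers: %lu retries for 1000 one-byte CAS ops\n", n, total);
	return 0;
}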
Nico Boehr Nov. 17, 2022, 8:42 a.m. UTC | #6
Quoting Janis Schoetterl-Glausch (2022-11-16 20:36:46)
> On Wed, 2022-11-02 at 15:12 +0100, Heiko Carstens wrote:
> > 
> [...]
> 
> > I also did not limit the number of retries for the one and two byte
> > scenario. Before doing that we need to have proof that there really is a
> > problem. Maybe Nico or you will give this a try.
> 
> I wrote a memop selftest testcase where the main thread uses the one byte cmpxchg
> while n vcpus flip adjacent bits. The time the test case runs increases superlinearly with n.
> With 248 vcpus, 1000 one byte cmpxchgs take 25s.
> I'm not sure how meaningful the test is since the worst case would be if the threads hammering
> the word ran on a cpu dedicated to them.
> 
> In any case, why not err on the side of caution and limit the iterations?
> I'll send an rfc patch.

I agree, limiting sounds like the safe choice.

Patch

diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index f7038b800cc3..f148f5a22c93 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -19,6 +19,8 @@ 
 #include <asm/extable.h>
 #include <asm/facility.h>
 #include <asm-generic/access_ok.h>
+#include <asm/page.h>
+#include <linux/log2.h>
 
 void debug_user_asce(int exit);
 
@@ -390,4 +392,191 @@  do {									\
 		goto err_label;						\
 } while (0)
 
+static __always_inline int __cmpxchg_user_key_small(int size, u64 address,
+						    unsigned __int128 *old_p,
+						    unsigned __int128 new, u8 access_key)
+{
+	u32 shift, mask, old_word, new_word, align_mask, tmp;
+	u64 aligned;
+	int ret = -EFAULT;
+
+	switch (size) {
+	case 2:
+		align_mask = 2;
+		aligned = (address ^ (address & align_mask));
+		shift = (sizeof(u32) - (address & align_mask) - size) * 8;
+		mask = 0xffff << shift;
+		old_word = ((u16)*old_p) << shift;
+		new_word = ((u16)new) << shift;
+		break;
+	case 1:
+		align_mask = 3;
+		aligned = (address ^ (address & align_mask));
+		shift = (sizeof(u32) - (address & align_mask) - size) * 8;
+		mask = 0xff << shift;
+		old_word = ((u8)*old_p) << shift;
+		new_word = ((u8)new) << shift;
+		break;
+	}
+	tmp = old_word; /* don't modify *old_p on fault */
+	asm volatile(
+		       "spka	0(%[access_key])\n"
+		"	sacf	256\n"
+		"0:	l	%[tmp],%[aligned]\n"
+		"1:	nr	%[tmp],%[mask]\n"
+		"	xilf	%[mask],0xffffffff\n"
+		"	or	%[new_word],%[tmp]\n"
+		"	or	%[tmp],%[old_word]\n"
+		"2:	lr	%[old_word],%[tmp]\n"
+		"3:	cs	%[tmp],%[new_word],%[aligned]\n"
+		"4:	jnl	5f\n"
+		/* We'll restore old_word before the cs, use reg for the diff */
+		"	xr	%[old_word],%[tmp]\n"
+		/* Apply diff assuming only bits outside target byte(s) changed */
+		"	xr	%[new_word],%[old_word]\n"
+		/* If prior assumption false we exit loop, so not an issue */
+		"	nr	%[old_word],%[mask]\n"
+		"	jz	2b\n"
+		"5:	ipm	%[ret]\n"
+		"	srl	%[ret],28\n"
+		"6:	sacf	768\n"
+		"	spka	%[default_key]\n"
+		EX_TABLE(0b, 6b) EX_TABLE(1b, 6b)
+		EX_TABLE(3b, 6b) EX_TABLE(4b, 6b)
+		: [old_word] "+&d" (old_word),
+		  [new_word] "+&d" (new_word),
+		  [tmp] "+&d" (tmp),
+		  [aligned] "+Q" (*(u32 *)aligned),
+		  [ret] "+d" (ret)
+		: [access_key] "a" (access_key << 4),
+		  [mask] "d" (~mask),
+		  [default_key] "J" (PAGE_DEFAULT_KEY)
+		: "cc"
+	);
+	*old_p = (tmp & mask) >> shift;
+	return ret;
+}
+
+/**
+ * cmpxchg_user_key_size() - cmpxchg with user space target, honoring storage keys
+ * @size: Size of the value being cmpxchg'ed, one of 1,2,4,8,16.
+ * @address: User space address of value to compare to *@old_p and exchange with
+ *           @new. Must be aligned to @size.
+ * @old_p: Pointer to old value. Interpreted as a @size byte integer and compared
+ *         to the content pointed to by @address in order to determine if the
+ *         exchange occurs. The value read from @address is written back to *@old_p.
+ * @new: New value to place at @address, interpreted as a @size byte integer.
+ * @access_key: Access key to use for checking storage key protection.
+ *
+ * Perform a cmpxchg on a user space target, honoring storage key protection.
+ * @access_key alone determines how key checking is performed, neither
+ * storage-protection-override nor fetch-protection-override apply.
+ *
+ * Return:	0: successful exchange
+ *		1: exchange failed
+ *		-EFAULT: @address not accessible or not naturally aligned
+ *		-EINVAL: invalid @size
+ */
+static __always_inline int cmpxchg_user_key_size(int size, void __user *address,
+						 unsigned __int128 *old_p,
+						 unsigned __int128 new, u8 access_key)
+{
+	union {
+		u32 word;
+		u64 doubleword;
+	} old;
+	int ret = -EFAULT;
+
+	/*
+	 * The following assumes that:
+	 *  * the current psw key is the default key
+	 *  * no storage protection overrides are in effect
+	 */
+	might_fault();
+	switch (size) {
+	case 16:
+		asm volatile(
+			       "spka	0(%[access_key])\n"
+			"	sacf	256\n"
+			"0:	cdsg	%[old],%[new],%[target]\n"
+			"1:	ipm	%[ret]\n"
+			"	srl	%[ret],28\n"
+			"2:	sacf	768\n"
+			"	spka	%[default_key]\n"
+			EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
+			: [old] "+d" (*old_p),
+			  [target] "+Q" (*(unsigned __int128 __user *)address),
+			  [ret] "+d" (ret)
+			: [access_key] "a" (access_key << 4),
+			  [new] "d" (new),
+			  [default_key] "J" (PAGE_DEFAULT_KEY)
+			: "cc"
+		);
+		return ret;
+	case 8:
+		old.doubleword = *old_p;
+		asm volatile(
+			       "spka	0(%[access_key])\n"
+			"	sacf	256\n"
+			"0:	csg	%[old],%[new],%[target]\n"
+			"1:	ipm	%[ret]\n"
+			"	srl	%[ret],28\n"
+			"2:	sacf	768\n"
+			"	spka	%[default_key]\n"
+			EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
+			: [old] "+d" (old.doubleword),
+			  [target] "+Q" (*(u64 __user *)address),
+			  [ret] "+d" (ret)
+			: [access_key] "a" (access_key << 4),
+			  [new] "d" ((u64)new),
+			  [default_key] "J" (PAGE_DEFAULT_KEY)
+			: "cc"
+		);
+		*old_p = old.doubleword;
+		return ret;
+	case 4:
+		old.word = *old_p;
+		asm volatile(
+			       "spka	0(%[access_key])\n"
+			"	sacf	256\n"
+			"0:	cs	%[old],%[new],%[target]\n"
+			"1:	ipm	%[ret]\n"
+			"	srl	%[ret],28\n"
+			"2:	sacf	768\n"
+			"	spka	%[default_key]\n"
+			EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
+			: [old] "+d" (old.word),
+			  [target] "+Q" (*(u32 __user *)address),
+			  [ret] "+d" (ret)
+			: [access_key] "a" (access_key << 4),
+			  [new] "d" ((u32)new),
+			  [default_key] "J" (PAGE_DEFAULT_KEY)
+			: "cc"
+		);
+		*old_p = old.word;
+		return ret;
+	case 2:
+	case 1:
+		return __cmpxchg_user_key_small(size, (u64)address, old_p, new, access_key);
+	default:
+		return -EINVAL;
+	}
+}
+
+#define cmpxchg_user_key(target_p, old_p, new, access_key)			\
+({										\
+	__typeof__(old_p) __old_p = (old_p);					\
+	unsigned __int128 __old = *__old_p;					\
+	size_t __size = sizeof(*(target_p));					\
+	int __ret;								\
+										\
+	BUILD_BUG_ON(__size != sizeof(*__old_p));				\
+	BUILD_BUG_ON(__size != sizeof(new));					\
+	BUILD_BUG_ON(__size > 16 || !is_power_of_2(__size));			\
+	__ret = cmpxchg_user_key_size(__size, (target_p), &__old, (new),	\
+				      (access_key));				\
+	*__old_p = __old;							\
+	__ret;									\
+})
+
 #endif /* __S390_UACCESS_H */