diff mbox series

[16/17] prmem: pratomic-long

Message ID 20181023213504.28905-17-igor.stoppa@huawei.com (mailing list archive)
State New, archived
Headers show
Series prmem: protected memory | expand

Commit Message

Igor Stoppa Oct. 23, 2018, 9:35 p.m. UTC
Minimalistic functionality for having the write rare version of
atomic_long_t data.

Signed-off-by: Igor Stoppa <igor.stoppa@huawei.com>
CC: Will Deacon <will.deacon@arm.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Arnd Bergmann <arnd@arndb.de>
CC: linux-arch@vger.kernel.org
CC: linux-kernel@vger.kernel.org
---
 MAINTAINERS                   |  1 +
 include/linux/pratomic-long.h | 73 +++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 include/linux/pratomic-long.h

Comments

Peter Zijlstra Oct. 25, 2018, 12:13 a.m. UTC | #1
On Wed, Oct 24, 2018 at 12:35:03AM +0300, Igor Stoppa wrote:
> +static __always_inline
> +bool __pratomic_long_op(bool inc, struct pratomic_long_t *l)
> +{
> +	struct page *page;
> +	uintptr_t base;
> +	uintptr_t offset;
> +	unsigned long flags;
> +	size_t size = sizeof(*l);
> +	bool is_virt = __is_wr_after_init(l, size);
> +
> +	if (WARN(!(is_virt || likely(__is_wr_pool(l, size))),
> +		 WR_ERR_RANGE_MSG))
> +		return false;
> +	local_irq_save(flags);
> +	if (is_virt)
> +		page = virt_to_page(l);
> +	else
> +		vmalloc_to_page(l);
> +	offset = (~PAGE_MASK) & (uintptr_t)l;
> +	base = (uintptr_t)vmap(&page, 1, VM_MAP, PAGE_KERNEL);
> +	if (WARN(!base, WR_ERR_PAGE_MSG)) {
> +		local_irq_restore(flags);
> +		return false;
> +	}
> +	if (inc)
> +		atomic_long_inc((atomic_long_t *)(base + offset));
> +	else
> +		atomic_long_dec((atomic_long_t *)(base + offset));
> +	vunmap((void *)base);
> +	local_irq_restore(flags);
> +	return true;
> +
> +}

That's just hideously nasty.. and horribly broken.

We're not going to duplicate all these kernel interfaces wrapped in gunk
like that. Also, you _cannot_ call vunmap() with IRQs disabled. Clearly
you've never tested this with debug bits enabled.
Igor Stoppa Oct. 29, 2018, 9:17 p.m. UTC | #2
On 25/10/2018 01:13, Peter Zijlstra wrote:
> On Wed, Oct 24, 2018 at 12:35:03AM +0300, Igor Stoppa wrote:
>> +static __always_inline
>> +bool __pratomic_long_op(bool inc, struct pratomic_long_t *l)
>> +{
>> +	struct page *page;
>> +	uintptr_t base;
>> +	uintptr_t offset;
>> +	unsigned long flags;
>> +	size_t size = sizeof(*l);
>> +	bool is_virt = __is_wr_after_init(l, size);
>> +
>> +	if (WARN(!(is_virt || likely(__is_wr_pool(l, size))),
>> +		 WR_ERR_RANGE_MSG))
>> +		return false;
>> +	local_irq_save(flags);
>> +	if (is_virt)
>> +		page = virt_to_page(l);
>> +	else
>> +		vmalloc_to_page(l);
>> +	offset = (~PAGE_MASK) & (uintptr_t)l;
>> +	base = (uintptr_t)vmap(&page, 1, VM_MAP, PAGE_KERNEL);
>> +	if (WARN(!base, WR_ERR_PAGE_MSG)) {
>> +		local_irq_restore(flags);
>> +		return false;
>> +	}
>> +	if (inc)
>> +		atomic_long_inc((atomic_long_t *)(base + offset));
>> +	else
>> +		atomic_long_dec((atomic_long_t *)(base + offset));
>> +	vunmap((void *)base);
>> +	local_irq_restore(flags);
>> +	return true;
>> +
>> +}
> 
> That's just hideously nasty.. and horribly broken.
> 
> We're not going to duplicate all these kernel interfaces wrapped in gunk
> like that. 

one possibility would be to have macros which use typeof() on the 
parameter being passed, to decide what implementation to use: regular or 
write-rare

This means that type punning would still be needed, to select the 
implementation.

Would this be enough? Is there some better way?

> Also, you _cannot_ call vunmap() with IRQs disabled. Clearly
> you've never tested this with debug bits enabled.

I thought I had them. And I _did_ have them enabled, at some point.
But I must have messed up with the configuration and I failed to notice 
this.

I can think of a way it might work, albeit it's not going to be very pretty:

* for the vmap(): if I understand correctly, it might sleep while 
obtaining memory for creating the mapping. This part could be executed 
before disabling interrupts. The rest of the function, instead, would be 
executed after interrupts are disabled.

* for vunmap(): after the writing is done, change also the alternate 
mapping to read only, then enable interrupts and destroy the alternate 
mapping. Making also the secondary mapping read only makes it equally 
secure as the primary, which means that it can be visible also with 
interrupts enabled.


--
igor
Peter Zijlstra Oct. 30, 2018, 3:58 p.m. UTC | #3
On Mon, Oct 29, 2018 at 11:17:14PM +0200, Igor Stoppa wrote:
> 
> 
> On 25/10/2018 01:13, Peter Zijlstra wrote:
> > On Wed, Oct 24, 2018 at 12:35:03AM +0300, Igor Stoppa wrote:
> > > +static __always_inline
> > > +bool __pratomic_long_op(bool inc, struct pratomic_long_t *l)
> > > +{
> > > +	struct page *page;
> > > +	uintptr_t base;
> > > +	uintptr_t offset;
> > > +	unsigned long flags;
> > > +	size_t size = sizeof(*l);
> > > +	bool is_virt = __is_wr_after_init(l, size);
> > > +
> > > +	if (WARN(!(is_virt || likely(__is_wr_pool(l, size))),
> > > +		 WR_ERR_RANGE_MSG))
> > > +		return false;
> > > +	local_irq_save(flags);
> > > +	if (is_virt)
> > > +		page = virt_to_page(l);
> > > +	else
> > > +		vmalloc_to_page(l);
> > > +	offset = (~PAGE_MASK) & (uintptr_t)l;
> > > +	base = (uintptr_t)vmap(&page, 1, VM_MAP, PAGE_KERNEL);
> > > +	if (WARN(!base, WR_ERR_PAGE_MSG)) {
> > > +		local_irq_restore(flags);
> > > +		return false;
> > > +	}
> > > +	if (inc)
> > > +		atomic_long_inc((atomic_long_t *)(base + offset));
> > > +	else
> > > +		atomic_long_dec((atomic_long_t *)(base + offset));
> > > +	vunmap((void *)base);
> > > +	local_irq_restore(flags);
> > > +	return true;
> > > +
> > > +}
> > 
> > That's just hideously nasty.. and horribly broken.
> > 
> > We're not going to duplicate all these kernel interfaces wrapped in gunk
> > like that.
> 
> one possibility would be to have macros which use typeof() on the parameter
> being passed, to decide what implementation to use: regular or write-rare
> 
> This means that type punning would still be needed, to select the
> implementation.
> 
> Would this be enough? Is there some better way?

Like mentioned elsewhere; if you do write_enable() + write_disable()
thingies, it all becomes:

	write_enable();
	atomic_foo(&bar);
	write_disable();

No magic gunk infested duplication at all. Of course, ideally you'd then
teach objtool about this (or a GCC plugin I suppose) to ensure any
enable reached a disable.

The alternative is something like:

#define ALLOW_WRITE(stmt) do { write_enable(); do { stmt; } while (0); write_disable(); } while (0)

which then allows you to write:

	ALLOW_WRITE(atomic_foo(&bar));

No duplication.

> > Also, you _cannot_ call vunmap() with IRQs disabled. Clearly
> > you've never tested this with debug bits enabled.
> 
> I thought I had them. And I _did_ have them enabled, at some point.
> But I must have messed up with the configuration and I failed to notice
> this.
> 
> I can think of a way it might work, albeit it's not going to be very pretty:
> 
> * for the vmap(): if I understand correctly, it might sleep while obtaining
> memory for creating the mapping. This part could be executed before
> disabling interrupts. The rest of the function, instead, would be executed
> after interrupts are disabled.
> 
> * for vunmap(): after the writing is done, change also the alternate mapping
> to read only, then enable interrupts and destroy the alternate mapping.
> Making also the secondary mapping read only makes it equally secure as the
> primary, which means that it can be visible also with interrupts enabled.

That doesn't work if you wanted to do this write while you already have
IRQs disabled for example.
Will Deacon Oct. 30, 2018, 4:28 p.m. UTC | #4
On Tue, Oct 30, 2018 at 04:58:41PM +0100, Peter Zijlstra wrote:
> On Mon, Oct 29, 2018 at 11:17:14PM +0200, Igor Stoppa wrote:
> > 
> > 
> > On 25/10/2018 01:13, Peter Zijlstra wrote:
> > > On Wed, Oct 24, 2018 at 12:35:03AM +0300, Igor Stoppa wrote:
> > > > +static __always_inline
> > > > +bool __pratomic_long_op(bool inc, struct pratomic_long_t *l)
> > > > +{
> > > > +	struct page *page;
> > > > +	uintptr_t base;
> > > > +	uintptr_t offset;
> > > > +	unsigned long flags;
> > > > +	size_t size = sizeof(*l);
> > > > +	bool is_virt = __is_wr_after_init(l, size);
> > > > +
> > > > +	if (WARN(!(is_virt || likely(__is_wr_pool(l, size))),
> > > > +		 WR_ERR_RANGE_MSG))
> > > > +		return false;
> > > > +	local_irq_save(flags);
> > > > +	if (is_virt)
> > > > +		page = virt_to_page(l);
> > > > +	else
> > > > +		vmalloc_to_page(l);
> > > > +	offset = (~PAGE_MASK) & (uintptr_t)l;
> > > > +	base = (uintptr_t)vmap(&page, 1, VM_MAP, PAGE_KERNEL);
> > > > +	if (WARN(!base, WR_ERR_PAGE_MSG)) {
> > > > +		local_irq_restore(flags);
> > > > +		return false;
> > > > +	}
> > > > +	if (inc)
> > > > +		atomic_long_inc((atomic_long_t *)(base + offset));
> > > > +	else
> > > > +		atomic_long_dec((atomic_long_t *)(base + offset));
> > > > +	vunmap((void *)base);
> > > > +	local_irq_restore(flags);
> > > > +	return true;
> > > > +
> > > > +}
> > > 
> > > That's just hideously nasty.. and horribly broken.
> > > 
> > > We're not going to duplicate all these kernel interfaces wrapped in gunk
> > > like that.
> > 
> > one possibility would be to have macros which use typeof() on the parameter
> > being passed, to decide what implementation to use: regular or write-rare
> > 
> > This means that type punning would still be needed, to select the
> > implementation.
> > 
> > Would this be enough? Is there some better way?
> 
> Like mentioned elsewhere; if you do write_enable() + write_disable()
> thingies, it all becomes:
> 
> 	write_enable();
> 	atomic_foo(&bar);
> 	write_disable();
> 
> No magic gunk infested duplication at all. Of course, ideally you'd then
> teach objtool about this (or a GCC plugin I suppose) to ensure any
> enable reached a disable.

Isn't the issue here that we don't want to change the page tables for the
mapping of &bar, but instead want to create a temporary writable alias
at a random virtual address?

So you'd want:

	wbar = write_enable(&bar);
	atomic_foo(wbar);
	write_disable(wbar);

which is probably better expressed as a map/unmap API. I suspect this
would also be the only way to do things for cmpxchg() loops, where you
want to create the mapping outside of the loop to minimise your time in
the critical section.

Will
Peter Zijlstra Oct. 31, 2018, 9:10 a.m. UTC | #5
On Tue, Oct 30, 2018 at 04:28:16PM +0000, Will Deacon wrote:
> On Tue, Oct 30, 2018 at 04:58:41PM +0100, Peter Zijlstra wrote:
> > Like mentioned elsewhere; if you do write_enable() + write_disable()
> > thingies, it all becomes:
> > 
> > 	write_enable();
> > 	atomic_foo(&bar);
> > 	write_disable();
> > 
> > No magic gunk infested duplication at all. Of course, ideally you'd then
> > teach objtool about this (or a GCC plugin I suppose) to ensure any
> > enable reached a disable.
> 
> Isn't the issue here that we don't want to change the page tables for the
> mapping of &bar, but instead want to create a temporary writable alias
> at a random virtual address?
> 
> So you'd want:
> 
> 	wbar = write_enable(&bar);
> 	atomic_foo(wbar);
> 	write_disable(wbar);
> 
> which is probably better expressed as a map/unmap API. I suspect this
> would also be the only way to do things for cmpxchg() loops, where you
> want to create the mapping outside of the loop to minimise your time in
> the critical section.

Ah, so I was thinking that the alternative mm would have stuff in the
same location, just RW instead of RO.

But yes, if we, like Andy suggests, use the userspace address range for
the aliases, then we need to do as you suggest.
Kees Cook Nov. 1, 2018, 3:28 a.m. UTC | #6
On Wed, Oct 31, 2018 at 2:10 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Oct 30, 2018 at 04:28:16PM +0000, Will Deacon wrote:
>> On Tue, Oct 30, 2018 at 04:58:41PM +0100, Peter Zijlstra wrote:
>> > Like mentioned elsewhere; if you do write_enable() + write_disable()
>> > thingies, it all becomes:
>> >
>> >     write_enable();
>> >     atomic_foo(&bar);
>> >     write_disable();
>> >
>> > No magic gunk infested duplication at all. Of course, ideally you'd then
>> > teach objtool about this (or a GCC plugin I suppose) to ensure any
>> > enable reached a disable.
>>
>> Isn't the issue here that we don't want to change the page tables for the
>> mapping of &bar, but instead want to create a temporary writable alias
>> at a random virtual address?
>>
>> So you'd want:
>>
>>       wbar = write_enable(&bar);
>>       atomic_foo(wbar);
>>       write_disable(wbar);
>>
>> which is probably better expressed as a map/unmap API. I suspect this
>> would also be the only way to do things for cmpxchg() loops, where you
>> want to create the mapping outside of the loop to minimise your time in
>> the critical section.
>
> Ah, so I was thinking that the alternative mm would have stuff in the
> same location, just RW instead of RO.

I was hoping for the same location too. That allows us to use a gcc
plugin to mark, say, function pointer tables, as read-only, and
annotate their rare updates with write_rare() without any
recalculation.

-Kees
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index e7f7cb1682a6..9d72688d00a3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9466,6 +9466,7 @@  F:	mm/test_pmalloc.c
 F:	Documentation/core-api/prmem.rst
 F:	include/linux/prlist.h
 F:	lib/test_prlist.c
+F:	include/linux/pratomic-long.h
 
 MEMORY MANAGEMENT
 L:	linux-mm@kvack.org
diff --git a/include/linux/pratomic-long.h b/include/linux/pratomic-long.h
new file mode 100644
index 000000000000..8f1408593733
--- /dev/null
+++ b/include/linux/pratomic-long.h
@@ -0,0 +1,100 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Atomic operations for write rare memory */
+#ifndef _LINUX_PRATOMIC_LONG_H
+#define _LINUX_PRATOMIC_LONG_H
+#include <linux/prmem.h>
+#include <linux/compiler.h>
+#include <asm-generic/atomic-long.h>
+
+/**
+ * struct pratomic_long_t - atomic long for write rare memory
+ * @l: the wrapped counter, aligned to its own size
+ *
+ * Being aligned to its own size, the counter cannot straddle a page
+ * boundary, so __pratomic_long_op() only ever has to remap one page.
+ */
+struct pratomic_long_t {
+	atomic_long_t l __aligned(sizeof(atomic_long_t));
+} __aligned(sizeof(atomic_long_t));
+
+/* Static initializer for a struct pratomic_long_t holding the value @i */
+#define PRATOMIC_LONG_INIT(i)	{	\
+	.l = ATOMIC_LONG_INIT((i)),	\
+}
+
+/**
+ * __pratomic_long_op - increment or decrement a write rare atomic long
+ * @inc: true to increment, false to decrement
+ * @l: address of the variable of type struct pratomic_long_t
+ *
+ * @l must satisfy __is_wr_after_init() or __is_wr_pool(); any other
+ * address triggers a WARN and is left untouched. The page holding @l is
+ * mapped a second time with PAGE_KERNEL permissions and the update goes
+ * through that alias, with local interrupts disabled, before the alias
+ * is torn down again.
+ *
+ * Return: true on success, false otherwise
+ */
+static __always_inline
+bool __pratomic_long_op(bool inc, struct pratomic_long_t *l)
+{
+	struct page *page;
+	uintptr_t base;
+	uintptr_t offset;
+	unsigned long flags;
+	size_t size = sizeof(*l);
+	bool is_virt = __is_wr_after_init(l, size);
+
+	if (WARN(!(is_virt || likely(__is_wr_pool(l, size))),
+		 WR_ERR_RANGE_MSG))
+		return false;
+	/*
+	 * NOTE(review): vunmap() must not be called with IRQs disabled and
+	 * vmap() presumably may sleep, so the map/unmap sequence below
+	 * still has to be reworked.
+	 */
+	local_irq_save(flags);
+	/* Both branches must yield the page, or vmap() reads garbage. */
+	if (is_virt)
+		page = virt_to_page(l);
+	else
+		page = vmalloc_to_page(l);
+	/* Position of @l inside its page; identical in the alias. */
+	offset = (~PAGE_MASK) & (uintptr_t)l;
+	base = (uintptr_t)vmap(&page, 1, VM_MAP, PAGE_KERNEL);
+	if (WARN(!base, WR_ERR_PAGE_MSG)) {
+		local_irq_restore(flags);
+		return false;
+	}
+	if (inc)
+		atomic_long_inc((atomic_long_t *)(base + offset));
+	else
+		atomic_long_dec((atomic_long_t *)(base + offset));
+	vunmap((void *)base);
+	local_irq_restore(flags);
+	return true;
+}
+
+/**
+ * pratomic_long_inc - atomic increment of rare write long
+ * @l: address of the variable of type struct pratomic_long_t
+ *
+ * Return: true on success, false otherwise
+ */
+static __always_inline bool pratomic_long_inc(struct pratomic_long_t *l)
+{
+	return __pratomic_long_op(true, l);
+}
+
+/**
+ * pratomic_long_dec - atomic decrement of rare write long
+ * @l: address of the variable of type struct pratomic_long_t
+ *
+ * Return: true on success, false otherwise
+ */
+static __always_inline bool pratomic_long_dec(struct pratomic_long_t *l)
+{
+	return __pratomic_long_op(false, l);
+}
+
+#endif