[kvm-unit-tests,8/9] s390x: add vmalloc support
diff mbox

Message ID 20180110215348.315-9-david@redhat.com
State New
Headers show

Commit Message

David Hildenbrand Jan. 10, 2018, 9:53 p.m. UTC
To use virtual addresses, we have to
- build page tables with identity mapping
- setup the primary ASCE in cr1
- enable DAT in the PSW

Not using the Linux definitions/implementation as they contain too much
software defined stuff / things we don't need.

Written from scratch. Tried to stick to the general Linux naming
schemes.

As we currently don't invalidate anything except page table entries, it
is sufficient to only use ipte for now.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 lib/s390x/asm/arch_def.h |  45 ++++++++++
 lib/s390x/asm/page.h     |  24 +++++
 lib/s390x/asm/pgtable.h  | 222 +++++++++++++++++++++++++++++++++++++++++++++++
 lib/s390x/mmu.c          |  90 +++++++++++++++++++
 lib/s390x/sclp.c         |   1 +
 s390x/Makefile           |   2 +
 s390x/cstart64.S         |   3 +-
 7 files changed, 386 insertions(+), 1 deletion(-)
 create mode 100644 lib/s390x/asm/pgtable.h
 create mode 100644 lib/s390x/mmu.c

Comments

Thomas Huth Jan. 11, 2018, 11:51 a.m. UTC | #1
On 10.01.2018 22:53, David Hildenbrand wrote:
> To use virtual addresses, we have to
> - build page tables with identity mapping
> - setup the primary ASCE in cr1
> - enable DAT in the PSW
> 
> Not using the Linux definititons/implementation as they contain too much

s/definititons/definitions/

> software defined stuff / things we don't need.
> 
> Written from scratch. Tried to stick to the general Linux naming
> schemes.
> 
> As we currently don't invalidate anything except page table entries, it
> is suficient to only use ipte for now.

s/suficient/sufficient/

> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  lib/s390x/asm/arch_def.h |  45 ++++++++++
>  lib/s390x/asm/page.h     |  24 +++++
>  lib/s390x/asm/pgtable.h  | 222 +++++++++++++++++++++++++++++++++++++++++++++++
>  lib/s390x/mmu.c          |  90 +++++++++++++++++++
>  lib/s390x/sclp.c         |   1 +
>  s390x/Makefile           |   2 +
>  s390x/cstart64.S         |   3 +-
>  7 files changed, 386 insertions(+), 1 deletion(-)
>  create mode 100644 lib/s390x/asm/pgtable.h
>  create mode 100644 lib/s390x/mmu.c
> 
> diff --git a/lib/s390x/asm/arch_def.h b/lib/s390x/asm/arch_def.h
> index ae27ab2..416dc75 100644
> --- a/lib/s390x/asm/arch_def.h
> +++ b/lib/s390x/asm/arch_def.h
> @@ -163,4 +163,49 @@ static inline int tprot(unsigned long addr)
>  	return cc;
>  }
>  
> +static inline void lctlg(int cr, uint64_t value)
> +{
> +	asm volatile(
> +		"	lctlg	%1,%1,%0\n"
> +		: : "Q" (value), "i" (cr));

Doesn't the compiler complain here? I thought that "i" is only possible
with constants? I guess the compiler is smart enough to replace it with
the right constants, since this is an inline function. But maybe better
play safe and turn this into a macro instead.

> +}
> +
> +static inline uint64_t stctg(int cr)
> +{
> +	uint64_t value;
> +
> +	asm volatile(
> +		"	stctg	%1,%1,%0\n"
> +		: "=Q" (value) : "i" (cr) : "memory");

ditto

> +	return value;
> +}
> +
> +static inline void configure_dat(int enable)
> +{
> +	struct psw psw = {
> +		.mask = 0,
> +		.addr = 0,
> +	};
> +
> +	asm volatile(
> +		/* fetch the old PSW masks */
> +		"	epsw	%1,%2\n"
> +		/* remove the DAT flag first */
> +		"	nilh	%1,0xfbff\n"
> +		"	or	%3,%3\n"
> +		"	jz	disable\n"
> +		/* set DAT flag if enable == true */
> +		"	oilh	%1,0x0400\n"
> +		"	st	%1,0(%0)\n"
> +		"disable:\n"

Shouldn't the "disable" label be placed before the "st" ?

> +		"	st	%2,4(%0)\n"
> +		/* load the target address for our new PSW */
> +		"	larl	%1,cont\n"
> +		"	stg	%1,8(%0)\n"
> +		"	lpswe	0(%0)\n"
> +		"cont:\n"
> +		: : "a" (&psw), "r" (0), "r" (1), "r" (enable)

The usage of "r" (0), "r" (1) looks wrong. I think you should rather put
them into the output list instead, since they are modified within the
assembler code. Or maybe even better, use fixed registers in the
assembler code and mark the corresponding registers in the clobber list.

> +		: "memory", "cc");
> +}

Why is this an inline function in this header? It seems to only be used
in mmu.c, so I'd suggest to move it there (or even in a separate
assembler file).

>  #endif
> diff --git a/lib/s390x/asm/page.h b/lib/s390x/asm/page.h
> index 141a456..e7cc16d 100644
> --- a/lib/s390x/asm/page.h
> +++ b/lib/s390x/asm/page.h
[...]

I'll review this part later...

> diff --git a/lib/s390x/sclp.c b/lib/s390x/sclp.c
> index c0492b8..522c387 100644
> --- a/lib/s390x/sclp.c
> +++ b/lib/s390x/sclp.c
> @@ -26,6 +26,7 @@ static uint64_t ram_size;
>  static void mem_init(phys_addr_t freemem_start, phys_addr_t mem_end)
>  {
>  	phys_alloc_init(freemem_start, ram_size - freemem_start);
> +	setup_vm();

Do we really want to enable VM unconditionally for all tests? x86 also
calls setup_vm() only for some specific test. I think we should keep the
possibility to run some tests in real mode, too, shouldn't we?

>  }
>  

 Thomas
David Hildenbrand Jan. 11, 2018, 12:07 p.m. UTC | #2
>>  
>> +static inline void lctlg(int cr, uint64_t value)
>> +{
>> +	asm volatile(
>> +		"	lctlg	%1,%1,%0\n"
>> +		: : "Q" (value), "i" (cr));
> 
> Doesn't the compiler complain here? I though that "i" is only possible
> with constants? I guess the compiler is smart enough to replace it with
> the right constants, since this is an inline function. But maybe better
> play safe and turn this into a macro instead.

This works as it gets inlined. asm macros are frowned upon, no?

> 
>> +	return value;
>> +}
>> +
>> +static inline void configure_dat(int enable)
>> +{
>> +	struct psw psw = {
>> +		.mask = 0,
>> +		.addr = 0,
>> +	};
>> +
>> +	asm volatile(
>> +		/* fetch the old PSW masks */
>> +		"	epsw	%1,%2\n"
>> +		/* remove the DAT flag first */
>> +		"	nilh	%1,0xfbff\n"
>> +		"	or	%3,%3\n"
>> +		"	jz	disable\n"
>> +		/* set DAT flag if enable == true */
>> +		"	oilh	%1,0x0400\n"
>> +		"	st	%1,0(%0)\n"
>> +		"disable:\n"
> 
> Shouldn't the "disable" label be placed before the "st" ?

yes indeed
> 
>> +		"	st	%2,4(%0)\n"
>> +		/* load the target address for our new PSW */
>> +		"	larl	%1,cont\n"
>> +		"	stg	%1,8(%0)\n"
>> +		"	lpswe	0(%0)\n"
>> +		"cont:\n"
>> +		: : "a" (&psw), "r" (0), "r" (1), "r" (enable)
> 
> The usage of "r" (0), "r" (1) looks wrong. I think you should rather put
> them into the output list instead, since they are modified within the
> assembler code. Or maybe even better, use fixed registers in the
> assembler code and mark the corresponding registers in the clobber list.

I tried to use the output list way and it would not let me specify
immediates (0). So I have to introduce local variables. Or is there
another way?

> 
>> +		: "memory", "cc");
>> +}
> 
> Why is this an inline function in this header? It seems to only be used
> in mmu.c, so I'd suggest to move it there (or even in a separate
> assembler file).

I will move it to mmu.c but export the function, because ...

> 
>>  #endif
>> diff --git a/lib/s390x/asm/page.h b/lib/s390x/asm/page.h
>> index 141a456..e7cc16d 100644
>> --- a/lib/s390x/asm/page.h
>> +++ b/lib/s390x/asm/page.h
> [...]
> 
> I'll review this part later...
> 
>> diff --git a/lib/s390x/sclp.c b/lib/s390x/sclp.c
>> index c0492b8..522c387 100644
>> --- a/lib/s390x/sclp.c
>> +++ b/lib/s390x/sclp.c
>> @@ -26,6 +26,7 @@ static uint64_t ram_size;
>>  static void mem_init(phys_addr_t freemem_start, phys_addr_t mem_end)
>>  {
>>  	phys_alloc_init(freemem_start, ram_size - freemem_start);
>> +	setup_vm();
> 
> Do we really want to enable VM uncoditionally for all tests? x86 also
> calls setup_vm() only for some specific test. I think we should keep the
> possibility to run some tests in real mode, too, shouldn't we?

A future test framework will most likely rely on malloc to work (that's
why we are implementing it). So I enable it as default. Each test can
simply disable dat for a certain time and reenable it then via
configure_dat().

(will also use this in sieve.c to get physical memory access)

> 
>>  }
>>  
> 
>  Thomas
> 

Thanks!
Thomas Huth Jan. 11, 2018, 12:24 p.m. UTC | #3
On 11.01.2018 13:07, David Hildenbrand wrote:
> 
>>>  
>>> +static inline void lctlg(int cr, uint64_t value)
>>> +{
>>> +	asm volatile(
>>> +		"	lctlg	%1,%1,%0\n"
>>> +		: : "Q" (value), "i" (cr));
>>
>> Doesn't the compiler complain here? I though that "i" is only possible
>> with constants? I guess the compiler is smart enough to replace it with
>> the right constants, since this is an inline function. But maybe better
>> play safe and turn this into a macro instead.
> 
> This works as it gets inlined. asm macros is frowned upon, no?

I'm just afraid that it is specific to the compiler version whether this
works with an inline function or not. But ok, let's keep it this way
first, and if we later hit a compiler where this does not work, we still
can replace it with a macro instead.

>>> +	asm volatile(
>>> +		/* fetch the old PSW masks */
>>> +		"	epsw	%1,%2\n"
>>> +		/* remove the DAT flag first */
>>> +		"	nilh	%1,0xfbff\n"
>>> +		"	or	%3,%3\n"
>>> +		"	jz	disable\n"
>>> +		/* set DAT flag if enable == true */
>>> +		"	oilh	%1,0x0400\n"
>>> +		"	st	%1,0(%0)\n"
>>> +		"disable:\n"
>>
>> Shouldn't the "disable" label be placed before the "st" ?
> 
> yes indeed
>>
>>> +		"	st	%2,4(%0)\n"
>>> +		/* load the target address for our new PSW */
>>> +		"	larl	%1,cont\n"
>>> +		"	stg	%1,8(%0)\n"
>>> +		"	lpswe	0(%0)\n"
>>> +		"cont:\n"
>>> +		: : "a" (&psw), "r" (0), "r" (1), "r" (enable)
>>
>> The usage of "r" (0), "r" (1) looks wrong. I think you should rather put
>> them into the output list instead, since they are modified within the
>> assembler code. Or maybe even better, use fixed registers in the
>> assembler code and mark the corresponding registers in the clobber list.
> 
> I tried to use the output list way and it would not let me specify
> immediates (0). So I have to introduce local variables. Or is there
> another way?

Use "+r"(0) in the output list? ... if that does not work, I think I'd
rather use the clobber list instead.

 Thomas
Paolo Bonzini Jan. 11, 2018, 2:01 p.m. UTC | #4
On 10/01/2018 22:53, David Hildenbrand wrote:
>  
> +typedef uint64_t pgdval_t;		/* Region-1 table entry */
> +typedef uint64_t p4dval_t;		/* Region-2 table entry*/
> +typedef uint64_t pudval_t;		/* Region-3 table Entry */
> +typedef uint64_t pmdval_t;		/* Segment table entry */
> +typedef uint64_t pteval_t;		/* Page table entry */
> +
> +typedef struct { pgdval_t pgd; } pgd_t;
> +typedef struct { p4dval_t p4d; } p4d_t;
> +typedef struct { pudval_t pud; } pud_t;
> +typedef struct { pmdval_t pmd; } pmd_t;
> +typedef struct { pteval_t pte; } pte_t;
> +
> +#define pgd_val(x)	((x).pgd)
> +#define p4d_val(x)	((x).p4d)
> +#define pud_val(x)	((x).pud)
> +#define pmd_val(x)	((x).pmd)
> +#define pte_val(x)	((x).pte)
> +
> +#define __pgd(x)	((pgd_t) { (x) } )
> +#define __p4d(x)	((p4d_t) { (x) } )
> +#define __pud(x)	((pud_t) { (x) } )
> +#define __pmd(x)	((pmd_t) { (x) } )
> +#define __pte(x)	((pte_t) { (x) } )
> +

You don't need to do this.  Using recursive functions as in x86 is
perfectly okay (just use pte_t as an opaque struct for type safety) if
you prefer.

Paolo
Paolo Bonzini Jan. 11, 2018, 2:02 p.m. UTC | #5
On 11/01/2018 13:07, David Hildenbrand wrote:
>>>  
>>> +static inline void lctlg(int cr, uint64_t value)
>>> +{
>>> +	asm volatile(
>>> +		"	lctlg	%1,%1,%0\n"
>>> +		: : "Q" (value), "i" (cr));
>> Doesn't the compiler complain here? I though that "i" is only possible
>> with constants? I guess the compiler is smart enough to replace it with
>> the right constants, since this is an inline function. But maybe better
>> play safe and turn this into a macro instead.
> This works as it gets inlined. asm macros is frowned upon, no?
> 

Then please use always_inline.

Paolo
Paolo Bonzini Jan. 11, 2018, 2:03 p.m. UTC | #6
On 11/01/2018 13:24, Thomas Huth wrote:
>>>> +		"	st	%2,4(%0)\n"
>>>> +		/* load the target address for our new PSW */
>>>> +		"	larl	%1,cont\n"
>>>> +		"	stg	%1,8(%0)\n"
>>>> +		"	lpswe	0(%0)\n"
>>>> +		"cont:\n"
>>>> +		: : "a" (&psw), "r" (0), "r" (1), "r" (enable)
>>> The usage of "r" (0), "r" (1) looks wrong. I think you should rather put
>>> them into the output list instead, since they are modified within the
>>> assembler code. Or maybe even better, use fixed registers in the
>>> assembler code and mark the corresponding registers in the clobber list.
>> I tried to use the output list way and it would not let me specify
>> immediates (0). So I have to introduce local variables. Or is there
>> another way?
> Use "+r"(0) in the output list? ... if that does not work, I think I'd
> rather use the clobber list instead.

You'd have to introduce local variables, yes.  I think it's fine,
perhaps also more self-documenting than "0" and "1" to s390-impaired me.

Paolo
David Hildenbrand Jan. 11, 2018, 8:08 p.m. UTC | #7
On 11.01.2018 15:02, Paolo Bonzini wrote:
> On 11/01/2018 13:07, David Hildenbrand wrote:
>>>>  
>>>> +static inline void lctlg(int cr, uint64_t value)
>>>> +{
>>>> +	asm volatile(
>>>> +		"	lctlg	%1,%1,%0\n"
>>>> +		: : "Q" (value), "i" (cr));
>>> Doesn't the compiler complain here? I though that "i" is only possible
>>> with constants? I guess the compiler is smart enough to replace it with
>>> the right constants, since this is an inline function. But maybe better
>>> play safe and turn this into a macro instead.
>> This works as it gets inlined. asm macros is frowned upon, no?
>>
> 
> Then please use always_inline.

FYI, we already do the same in lib/s390x/asm/cpacf.h "opcode" without
always_inline and that is copied from the kernel sources.

Thanks

> 
> Paolo
>
Paolo Bonzini Jan. 12, 2018, 10:10 a.m. UTC | #8
On 10/01/2018 22:53, David Hildenbrand wrote:
> @@ -26,6 +26,7 @@ static uint64_t ram_size;
>  static void mem_init(phys_addr_t freemem_start, phys_addr_t mem_end)
>  {
>  	phys_alloc_init(freemem_start, ram_size - freemem_start);
> +	setup_vm();
>  }
>  
>  void sclp_memory_setup(void)

I'd leave setup_vm() to tests if not strictly necessary.

Paolo
David Hildenbrand Jan. 12, 2018, 10:33 a.m. UTC | #9
On 12.01.2018 11:10, Paolo Bonzini wrote:
> On 10/01/2018 22:53, David Hildenbrand wrote:
>> @@ -26,6 +26,7 @@ static uint64_t ram_size;
>>  static void mem_init(phys_addr_t freemem_start, phys_addr_t mem_end)
>>  {
>>  	phys_alloc_init(freemem_start, ram_size - freemem_start);
>> +	setup_vm();
>>  }
>>  
>>  void sclp_memory_setup(void)
> 
> I'd leave setup_vm() to tests if not strictly necessary.
> 
> Paolo
> 

Makes handling of program exceptions unnecessarily complicated. (which
functions we're allowed to call etc.) And DAT can be easily disabled in
tests that require it instead.

So as long as there is no good reason for it, I prefer to keep it simple
and always set it up.
Andrew Jones Jan. 12, 2018, 12:17 p.m. UTC | #10
On Fri, Jan 12, 2018 at 11:33:55AM +0100, David Hildenbrand wrote:
> On 12.01.2018 11:10, Paolo Bonzini wrote:
> > On 10/01/2018 22:53, David Hildenbrand wrote:
> >> @@ -26,6 +26,7 @@ static uint64_t ram_size;
> >>  static void mem_init(phys_addr_t freemem_start, phys_addr_t mem_end)
> >>  {
> >>  	phys_alloc_init(freemem_start, ram_size - freemem_start);
> >> +	setup_vm();
> >>  }
> >>  
> >>  void sclp_memory_setup(void)
> > 
> > I'd leave setup_vm() to tests if not strictly necessary.
> > 
> > Paolo
> > 
> 
> Makes handling of program exceptions unnecessary complicated. (which
> functions we're allowed to call etc.) And DAT can be easily disabled in
> tests that require it instead.
> 
> So as long as there is no good reason for it, I prefer to keep it simple
> and always set it up.
>

I've been considering some changes for ARM that allow setup_vm() to be
conditionally skipped for unit tests that require that. I kicked around
several ideas as to what the condition should use. Currently I have an
auxinfo flag (added to a newly introduced flags field) which is set at
build time with a Makefile rule prototyped for it.

Long story, short; I think the way David is doing it now, which is like
ARM, is a fine start, and then if necessary we can extend the framework
to allow this auxinfo flag, or whatever, to give some flexibility to
unit tests that need it.

Thanks,
drew
Paolo Bonzini Jan. 12, 2018, 12:18 p.m. UTC | #11
On 12/01/2018 13:17, Andrew Jones wrote:
> On Fri, Jan 12, 2018 at 11:33:55AM +0100, David Hildenbrand wrote:
>> On 12.01.2018 11:10, Paolo Bonzini wrote:
>>> On 10/01/2018 22:53, David Hildenbrand wrote:
>>>> @@ -26,6 +26,7 @@ static uint64_t ram_size;
>>>>  static void mem_init(phys_addr_t freemem_start, phys_addr_t mem_end)
>>>>  {
>>>>  	phys_alloc_init(freemem_start, ram_size - freemem_start);
>>>> +	setup_vm();
>>>>  }
>>>>  
>>>>  void sclp_memory_setup(void)
>>>
>>> I'd leave setup_vm() to tests if not strictly necessary.
>>>
>>> Paolo
>>>
>>
>> Makes handling of program exceptions unnecessary complicated. (which
>> functions we're allowed to call etc.) And DAT can be easily disabled in
>> tests that require it instead.
>>
>> So as long as there is no good reason for it, I prefer to keep it simple
>> and always set it up.
>>
> 
> I've been considering some changes for ARM that allow setup_vm() to be
> conditionally skipped for unit tests that require that. I kicked around
> several ideas as to what the condition should use. Currently I have an
> auxinfo flag (added to a newly introduced flags field) which is set at
> build time with a Makefile rule prototyped for it.
> 
> Long story, short; I think the way David is doing it now, which is like
> ARM, is a fine start, and then if necessary we can extend the framework
> to allow this auxinfo flag, or whatever, to give some flexibility to
> unit tests that need it.

Yeah, that's okay.

Most x86 tests probably can use setup_vm unconditionally too.

Paolo
David Hildenbrand Jan. 12, 2018, 2:07 p.m. UTC | #12
On 11.01.2018 15:01, Paolo Bonzini wrote:
> On 10/01/2018 22:53, David Hildenbrand wrote:
>>  
>> +typedef uint64_t pgdval_t;		/* Region-1 table entry */
>> +typedef uint64_t p4dval_t;		/* Region-2 table entry*/
>> +typedef uint64_t pudval_t;		/* Region-3 table Entry */
>> +typedef uint64_t pmdval_t;		/* Segment table entry */
>> +typedef uint64_t pteval_t;		/* Page table entry */
>> +
>> +typedef struct { pgdval_t pgd; } pgd_t;
>> +typedef struct { p4dval_t p4d; } p4d_t;
>> +typedef struct { pudval_t pud; } pud_t;
>> +typedef struct { pmdval_t pmd; } pmd_t;
>> +typedef struct { pteval_t pte; } pte_t;
>> +
>> +#define pgd_val(x)	((x).pgd)
>> +#define p4d_val(x)	((x).p4d)
>> +#define pud_val(x)	((x).pud)
>> +#define pmd_val(x)	((x).pmd)
>> +#define pte_val(x)	((x).pte)
>> +
>> +#define __pgd(x)	((pgd_t) { (x) } )
>> +#define __p4d(x)	((p4d_t) { (x) } )
>> +#define __pud(x)	((pud_t) { (x) } )
>> +#define __pmd(x)	((pmd_t) { (x) } )
>> +#define __pte(x)	((pte_t) { (x) } )
>> +
> 
> You don't need to do this.  Using recursive functions as in x86 is
> perfectly okay (just use pte_t as an opaque struct for type safety) if
> you prefer.

I stuck to what arm does in their page.h

I assume you reference from Linux
	arch/x86/include/asm/pgtable_64_types.h

Which only have
	typedef struct { pteval_t pte; } pte_t;

I'll look into it, thanks!

> 
> Paolo
>
David Hildenbrand Jan. 12, 2018, 2:15 p.m. UTC | #13
On 11.01.2018 15:01, Paolo Bonzini wrote:
> On 10/01/2018 22:53, David Hildenbrand wrote:
>>  
>> +typedef uint64_t pgdval_t;		/* Region-1 table entry */
>> +typedef uint64_t p4dval_t;		/* Region-2 table entry*/
>> +typedef uint64_t pudval_t;		/* Region-3 table Entry */
>> +typedef uint64_t pmdval_t;		/* Segment table entry */
>> +typedef uint64_t pteval_t;		/* Page table entry */
>> +
>> +typedef struct { pgdval_t pgd; } pgd_t;
>> +typedef struct { p4dval_t p4d; } p4d_t;
>> +typedef struct { pudval_t pud; } pud_t;
>> +typedef struct { pmdval_t pmd; } pmd_t;
>> +typedef struct { pteval_t pte; } pte_t;
>> +
>> +#define pgd_val(x)	((x).pgd)
>> +#define p4d_val(x)	((x).p4d)
>> +#define pud_val(x)	((x).pud)
>> +#define pmd_val(x)	((x).pmd)
>> +#define pte_val(x)	((x).pte)
>> +
>> +#define __pgd(x)	((pgd_t) { (x) } )
>> +#define __p4d(x)	((p4d_t) { (x) } )
>> +#define __pud(x)	((pud_t) { (x) } )
>> +#define __pmd(x)	((pmd_t) { (x) } )
>> +#define __pte(x)	((pte_t) { (x) } )
>> +
> 
> You don't need to do this.  Using recursive functions as in x86 is
> perfectly okay (just use pte_t as an opaque struct for type safety) if
> you prefer.

Okay, looking at kvm-unit-tests, I think you were talking about struct
pte_search and find_pte_level().

I'd prefer to keep it as is for now, closer to what we do in the kernel
(and might come in handy once we want to construct broken page tables to
test the virtual address translation in QEMU/KVM).

Thanks!

> 
> Paolo
>

Patch
diff mbox

diff --git a/lib/s390x/asm/arch_def.h b/lib/s390x/asm/arch_def.h
index ae27ab2..416dc75 100644
--- a/lib/s390x/asm/arch_def.h
+++ b/lib/s390x/asm/arch_def.h
@@ -163,4 +163,49 @@  static inline int tprot(unsigned long addr)
 	return cc;
 }
 
+static inline void lctlg(int cr, uint64_t value)
+{
+	asm volatile(
+		"	lctlg	%1,%1,%0\n"
+		: : "Q" (value), "i" (cr));
+}
+
+static inline uint64_t stctg(int cr)
+{
+	uint64_t value;
+
+	asm volatile(
+		"	stctg	%1,%1,%0\n"
+		: "=Q" (value) : "i" (cr) : "memory");
+	return value;
+}
+
+static inline void configure_dat(int enable)
+{
+	struct psw psw = {
+		.mask = 0,
+		.addr = 0,
+	};
+
+	asm volatile(
+		/* fetch the old PSW masks */
+		"	epsw	%1,%2\n"
+		/* remove the DAT flag first */
+		"	nilh	%1,0xfbff\n"
+		"	or	%3,%3\n"
+		"	jz	disable\n"
+		/* set DAT flag if enable == true */
+		"	oilh	%1,0x0400\n"
+		"	st	%1,0(%0)\n"
+		"disable:\n"
+		"	st	%2,4(%0)\n"
+		/* load the target address for our new PSW */
+		"	larl	%1,cont\n"
+		"	stg	%1,8(%0)\n"
+		"	lpswe	0(%0)\n"
+		"cont:\n"
+		: : "a" (&psw), "r" (0), "r" (1), "r" (enable)
+		: "memory", "cc");
+}
+
 #endif
diff --git a/lib/s390x/asm/page.h b/lib/s390x/asm/page.h
index 141a456..e7cc16d 100644
--- a/lib/s390x/asm/page.h
+++ b/lib/s390x/asm/page.h
@@ -13,4 +13,28 @@ 
 
 #include <asm-generic/page.h>
 
+typedef uint64_t pgdval_t;		/* Region-1 table entry */
+typedef uint64_t p4dval_t;		/* Region-2 table entry */
+typedef uint64_t pudval_t;		/* Region-3 table Entry */
+typedef uint64_t pmdval_t;		/* Segment table entry */
+typedef uint64_t pteval_t;		/* Page table entry */
+
+typedef struct { pgdval_t pgd; } pgd_t;
+typedef struct { p4dval_t p4d; } p4d_t;
+typedef struct { pudval_t pud; } pud_t;
+typedef struct { pmdval_t pmd; } pmd_t;
+typedef struct { pteval_t pte; } pte_t;
+
+#define pgd_val(x)	((x).pgd)
+#define p4d_val(x)	((x).p4d)
+#define pud_val(x)	((x).pud)
+#define pmd_val(x)	((x).pmd)
+#define pte_val(x)	((x).pte)
+
+#define __pgd(x)	((pgd_t) { (x) } )
+#define __p4d(x)	((p4d_t) { (x) } )
+#define __pud(x)	((pud_t) { (x) } )
+#define __pmd(x)	((pmd_t) { (x) } )
+#define __pte(x)	((pte_t) { (x) } )
+
 #endif
diff --git a/lib/s390x/asm/pgtable.h b/lib/s390x/asm/pgtable.h
new file mode 100644
index 0000000..88713b5
--- /dev/null
+++ b/lib/s390x/asm/pgtable.h
@@ -0,0 +1,222 @@ 
+/*
+ * s390x page table definitions and functions
+ *
+ * Copyright (c) 2017 Red Hat Inc
+ *
+ * Authors:
+ *  David Hildenbrand <david@redhat.com>
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Library General Public License version 2.
+ */
+#ifndef _ASMS390X_PGTABLE_H_
+#define _ASMS390X_PGTABLE_H_
+
+#include <asm/page.h>
+#include <alloc_page.h>
+
+#define ASCE_ORIGIN			0xfffffffffffff000UL
+#define ASCE_G				0x0000000000000200UL
+#define ASCE_P				0x0000000000000100UL
+#define ASCE_S				0x0000000000000080UL
+#define ASCE_X				0x0000000000000040UL
+#define ASCE_R				0x0000000000000020UL
+#define ASCE_DT				0x000000000000000cUL
+#define ASCE_TL				0x0000000000000003UL
+
+#define ASCE_DT_REGION1			0x000000000000000cUL
+#define ASCE_DT_REGION2			0x0000000000000008UL
+#define ASCE_DT_REGION3			0x0000000000000004UL
+#define ASCE_DT_SEGMENT			0x0000000000000000UL
+
+#define REGION_TABLE_ORDER		2
+#define REGION_TABLE_ENTRIES		2048
+#define REGION_TABLE_LENGTH		3
+
+#define REGION1_SHIFT			53
+#define REGION2_SHIFT			42
+#define REGION3_SHIFT			31
+
+#define REGION_ENTRY_ORIGIN		0xfffffffffffff000UL
+#define REGION_ENTRY_P			0x0000000000000200UL
+#define REGION_ENTRY_TF			0x00000000000000c0UL
+#define REGION_ENTRY_I			0x0000000000000020UL
+#define REGION_ENTRY_TT			0x000000000000000cUL
+#define REGION_ENTRY_TL			0x0000000000000003UL
+
+#define REGION_ENTRY_TT_REGION1		0x000000000000000cUL
+#define REGION_ENTRY_TT_REGION2		0x0000000000000008UL
+#define REGION_ENTRY_TT_REGION3		0x0000000000000004UL
+
+#define REGION3_ENTRY_RFAA		0xffffffff80000000UL
+#define REGION3_ENTRY_AV		0x0000000000010000UL
+#define REGION3_ENTRY_ACC		0x000000000000f000UL
+#define REGION3_ENTRY_F			0x0000000000000800UL
+#define REGION3_ENTRY_FC		0x0000000000000400UL
+#define REGION3_ENTRY_IEP		0x0000000000000100UL
+#define REGION3_ENTRY_CR		0x0000000000000010UL
+
+#define SEGMENT_TABLE_ORDER		2
+#define SEGMENT_TABLE_ENTRIES		2048
+#define SEGMENT_TABLE_LENGTH		3
+#define SEGMENT_SHIFT			20
+
+#define SEGMENT_ENTRY_ORIGIN		0xfffffffffffff800UL
+#define SEGMENT_ENTRY_SFAA		0xfffffffffff80000UL
+#define SEGMENT_ENTRY_AV		0x0000000000010000UL
+#define SEGMENT_ENTRY_ACC		0x000000000000f000UL
+#define SEGMENT_ENTRY_F			0x0000000000000800UL
+#define SEGMENT_ENTRY_FC		0x0000000000000400UL
+#define SEGMENT_ENTRY_P			0x0000000000000200UL
+#define SEGMENT_ENTRY_IEP		0x0000000000000100UL
+#define SEGMENT_ENTRY_I			0x0000000000000020UL
+#define SEGMENT_ENTRY_CS		0x0000000000000010UL
+#define SEGMENT_ENTRY_TT		0x000000000000000cUL
+
+#define SEGMENT_ENTRY_TT_REGION1	0x000000000000000cUL
+#define SEGMENT_ENTRY_TT_REGION2	0x0000000000000008UL
+#define SEGMENT_ENTRY_TT_REGION3	0x0000000000000004UL
+#define SEGMENT_ENTRY_TT_SEGMENT	0x0000000000000000UL
+
+#define PAGE_TABLE_ORDER		0
+#define PAGE_TABLE_ENTRIES		256
+
+#define PAGE_ENTRY_I			0x000000000000400UL
+#define PAGE_ENTRY_P			0x000000000000200UL
+#define PAGE_ENTRY_IEP			0x000000000000100UL
+
+#define PTRS_PER_PGD			REGION_TABLE_ENTRIES
+#define PTRS_PER_P4D			REGION_TABLE_ENTRIES
+#define PTRS_PER_PUD			REGION_TABLE_ENTRIES
+#define PTRS_PER_PMD			SEGMENT_TABLE_ENTRIES
+#define PTRS_PER_PTE			PAGE_TABLE_ENTRIES
+
+#define PGDIR_SHIFT			REGION1_SHIFT
+#define P4D_SHIFT			REGION2_SHIFT
+#define PUD_SHIFT			REGION3_SHIFT
+#define PMD_SHIFT			SEGMENT_SHIFT
+
+#define pgd_none(entry) (pgd_val(entry) & REGION_ENTRY_I)
+#define p4d_none(entry) (p4d_val(entry) & REGION_ENTRY_I)
+#define pud_none(entry) (pud_val(entry) & REGION_ENTRY_I)
+#define pmd_none(entry) (pmd_val(entry) & SEGMENT_ENTRY_I)
+#define pte_none(entry) (pte_val(entry) & PAGE_ENTRY_I)
+
+#define pgd_addr(entry) __va(pgd_val(entry) & REGION_ENTRY_ORIGIN)
+#define p4d_addr(entry) __va(p4d_val(entry) & REGION_ENTRY_ORIGIN)
+#define pud_addr(entry) __va(pud_val(entry) & REGION_ENTRY_ORIGIN)
+#define pmd_addr(entry) __va(pmd_val(entry) & SEGMENT_ENTRY_ORIGIN)
+#define pte_addr(entry) __va(pte_val(entry) & PAGE_MASK)
+
+#define pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
+#define p4d_index(addr) (((addr) >> P4D_SHIFT) & (PTRS_PER_P4D - 1))
+#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
+#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+
+#define pgd_offset(table, addr) ((pgd_t *)(table) + pgd_index(addr))
+#define p4d_offset(pgd, addr) ((p4d_t *)pgd_addr(*(pgd)) + p4d_index(addr))
+#define pud_offset(p4d, addr) ((pud_t *)p4d_addr(*(p4d)) + pud_index(addr))
+#define pmd_offset(pud, addr) ((pmd_t *)pud_addr(*(pud)) + pmd_index(addr))
+#define pte_offset(pmd, addr) ((pte_t *)pmd_addr(*(pmd)) + pte_index(addr))
+
+static inline pgd_t *pgd_alloc_one(void)
+{
+	pgd_t *pgd = alloc_pages(REGION_TABLE_ORDER);
+	int i;
+
+	for (i = 0; i < REGION_TABLE_ENTRIES; i++)
+		pgd_val(pgd[i]) = REGION_ENTRY_TT_REGION1 | REGION_ENTRY_I;
+	return pgd;
+}
+
+static inline p4d_t *p4d_alloc_one(void)
+{
+	p4d_t *p4d = alloc_pages(REGION_TABLE_ORDER);
+	int i;
+
+	for (i = 0; i < REGION_TABLE_ENTRIES; i++)
+		p4d_val(p4d[i]) = REGION_ENTRY_TT_REGION2 | REGION_ENTRY_I;
+	return p4d;
+}
+
+static inline p4d_t *p4d_alloc(pgd_t *pgd, unsigned long addr)
+{
+	if (pgd_none(*pgd)) {
+		p4d_t *p4d = p4d_alloc_one();
+		pgd_val(*pgd) = __pa(p4d) | REGION_ENTRY_TT_REGION1 |
+				REGION_TABLE_LENGTH;
+	}
+	return p4d_offset(pgd, addr);
+}
+
+static inline pud_t *pud_alloc_one(void)
+{
+	pud_t *pud = alloc_pages(REGION_TABLE_ORDER);
+	int i;
+
+	for (i = 0; i < REGION_TABLE_ENTRIES; i++)
+		pud_val(pud[i]) = REGION_ENTRY_TT_REGION3 | REGION_ENTRY_I;
+	return pud;
+}
+
+static inline pud_t *pud_alloc(p4d_t *p4d, unsigned long addr)
+{
+	if (p4d_none(*p4d)) {
+		pud_t *pud = pud_alloc_one();
+		p4d_val(*p4d) = __pa(pud) | REGION_ENTRY_TT_REGION2 |
+				REGION_TABLE_LENGTH;
+	}
+	return pud_offset(p4d, addr);
+}
+
+static inline pmd_t *pmd_alloc_one(void)
+{
+	pmd_t *pmd = alloc_pages(SEGMENT_TABLE_ORDER);
+	int i;
+
+	for (i = 0; i < SEGMENT_TABLE_ENTRIES; i++)
+		pmd_val(pmd[i]) = SEGMENT_ENTRY_TT_SEGMENT | SEGMENT_ENTRY_I;
+	return pmd;
+}
+
+static inline pmd_t *pmd_alloc(pud_t *pud, unsigned long addr)
+{
+	if (pud_none(*pud)) {
+		pmd_t *pmd = pmd_alloc_one();
+		pud_val(*pud) = __pa(pmd) | REGION_ENTRY_TT_REGION3 |
+				REGION_TABLE_LENGTH;
+	}
+	return pmd_offset(pud, addr);
+}
+
+static inline pte_t *pte_alloc_one(void)
+{
+	pte_t *pte = alloc_pages(PAGE_TABLE_ORDER);
+	int i;
+
+	for (i = 0; i < PAGE_TABLE_ENTRIES; i++)
+		pte_val(pte[i]) = PAGE_ENTRY_I;
+	return pte;
+}
+
+static inline pte_t *pte_alloc(pmd_t *pmd, unsigned long addr)
+{
+	if (pmd_none(*pmd)) {
+		pte_t *pte = pte_alloc_one();
+		pmd_val(*pmd) = __pa(pte) | SEGMENT_ENTRY_TT_SEGMENT |
+				SEGMENT_TABLE_LENGTH;
+	}
+	return pte_offset(pmd, addr);
+}
+
+static inline void ipte(unsigned long vaddr, pteval_t *p_pte)
+{
+	unsigned long table_origin = (unsigned long)p_pte & PAGE_MASK;
+
+	asm volatile(
+		"	ipte %0,%1\n"
+		: : "a" (table_origin), "a" (vaddr) : "memory");
+}
+
+#endif /* _ASMS390X_PGTABLE_H_ */
diff --git a/lib/s390x/mmu.c b/lib/s390x/mmu.c
new file mode 100644
index 0000000..d4d1d06
--- /dev/null
+++ b/lib/s390x/mmu.c
@@ -0,0 +1,90 @@ 
+/*
+ * s390x MMU
+ *
+ * Copyright (c) 2017 Red Hat Inc
+ *
+ * Authors:
+ *  David Hildenbrand <david@redhat.com>
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Library General Public License version 2.
+ */
+
+#include <libcflat.h>
+#include <asm/pgtable.h>
+#include <asm/arch_def.h>
+#include <vmalloc.h>
+
+static void mmu_enable(pgd_t *pgtable)
+{
+	const uint64_t asce = __pa(pgtable) | ASCE_DT_REGION1 |
+			      REGION_TABLE_LENGTH;
+
+	/* set primary asce */
+	lctlg(1, asce);
+	assert(stctg(1) == asce);
+
+	/* enable dat (primary == 0 set as default) */
+	configure_dat(1);
+}
+
+static pteval_t *get_pte(pgd_t *pgtable, uintptr_t vaddr)
+{
+	pgd_t *pgd = pgd_offset(pgtable, vaddr);
+	p4d_t *p4d = p4d_alloc(pgd, vaddr);
+	pud_t *pud = pud_alloc(p4d, vaddr);
+	pmd_t *pmd = pmd_alloc(pud, vaddr);
+	pte_t *pte = pte_alloc(pmd, vaddr);
+
+	return &pte_val(*pte);
+}
+
+phys_addr_t virt_to_pte_phys(pgd_t *pgtable, void *vaddr)
+{
+	return (*get_pte(pgtable, (uintptr_t)vaddr) & PAGE_MASK) +
+	       ((unsigned long)vaddr & ~PAGE_MASK);
+}
+
+pteval_t *install_page(pgd_t *pgtable, phys_addr_t phys, void *vaddr)
+{
+	pteval_t *p_pte = get_pte(pgtable, (uintptr_t)vaddr);
+
+	/* first flush the old entry (if we're replacing anything) */
+	ipte((uintptr_t)vaddr, p_pte);
+
+	*p_pte = __pa(phys);
+	return p_pte;
+}
+
+static void setup_identity(pgd_t *pgtable, phys_addr_t start_addr,
+			   phys_addr_t end_addr)
+{
+	phys_addr_t cur;
+
+	start_addr &= PAGE_MASK;
+	for (cur = start_addr; true; cur += PAGE_SIZE) {
+		if (start_addr < end_addr && cur >= end_addr)
+			break;
+		if (start_addr > end_addr && cur <= end_addr)
+			break;
+		install_page(pgtable, cur, __va(cur));
+	}
+}
+
+void *setup_mmu(phys_addr_t phys_end){
+	pgd_t *page_root;
+
+	/* allocate a region-1 table */
+	page_root = pgd_alloc_one();
+
+	/* map all physical memory 1:1 */
+	setup_identity(page_root, 0, phys_end);
+
+	/* generate 128MB of invalid addresses at the end (for testing PGM) */
+	init_alloc_vpage((void *) -(1UL << 27));
+	setup_identity(page_root, -(1UL << 27), 0);
+
+	/* finally enable DAT with the new table */
+	mmu_enable(page_root);
+	return page_root;
+}
diff --git a/lib/s390x/sclp.c b/lib/s390x/sclp.c
index c0492b8..522c387 100644
--- a/lib/s390x/sclp.c
+++ b/lib/s390x/sclp.c
@@ -26,6 +26,7 @@  static uint64_t ram_size;
 static void mem_init(phys_addr_t freemem_start, phys_addr_t mem_end)
 {
 	phys_alloc_init(freemem_start, ram_size - freemem_start);
+	setup_vm();
 }
 
 void sclp_memory_setup(void)
diff --git a/s390x/Makefile b/s390x/Makefile
index 4198fdc..d9bef37 100644
--- a/s390x/Makefile
+++ b/s390x/Makefile
@@ -24,12 +24,14 @@  cflatobjs += lib/util.o
 cflatobjs += lib/alloc.o
 cflatobjs += lib/alloc_phys.o
 cflatobjs += lib/alloc_page.o
+cflatobjs += lib/vmalloc.o
 cflatobjs += lib/alloc_phys.o
 cflatobjs += lib/s390x/io.o
 cflatobjs += lib/s390x/stack.o
 cflatobjs += lib/s390x/sclp.o
 cflatobjs += lib/s390x/sclp-ascii.o
 cflatobjs += lib/s390x/interrupt.o
+cflatobjs += lib/s390x/mmu.o
 
 OBJDIRS += lib/s390x
 
diff --git a/s390x/cstart64.S b/s390x/cstart64.S
index 4d0c877..1f837e9 100644
--- a/s390x/cstart64.S
+++ b/s390x/cstart64.S
@@ -93,7 +93,8 @@  pgm_int:
 initital_psw:
 	.quad	0x0000000180000000, init_psw_cont
 pgm_int_psw:
-	.quad	0x0000000180000000, pgm_int
+	/* once we test PGM interrupts, we already have DAT configured */
+	.quad	0x0400000180000000, pgm_int
 initital_cr0:
 	/* enable AFP-register control, so FP regs (+BFP instr) can be used */
 	.quad	0x0000000000040000