
[v7,8/9] ARM: vdso initialization, mapping, and synchronization

Message ID 1403493118-7597-9-git-send-email-nathan_lynch@mentor.com (mailing list archive)
State New, archived

Commit Message

Nathan Lynch June 23, 2014, 3:11 a.m. UTC
Initialize the vdso page list at boot, install the vdso mapping at
exec time, and update the data page during timer ticks.  This code is
not built if CONFIG_VDSO is not enabled.

Account for the vdso length when randomizing the offset from the
stack.  The vdso is placed immediately following the sigpage with a
separate install_special_mapping call in arm_install_vdso.

Signed-off-by: Nathan Lynch <nathan_lynch@mentor.com>
---
 arch/arm/kernel/process.c |  13 +++-
 arch/arm/kernel/vdso.c    | 168 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 179 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm/kernel/vdso.c

Comments

Andy Lutomirski June 30, 2014, 9:37 p.m. UTC | #1
On 06/22/2014 08:11 PM, Nathan Lynch wrote:
> Initialize the vdso page list at boot, install the vdso mapping at
> exec time, and update the data page during timer ticks.  This code is
> not built if CONFIG_VDSO is not enabled.
> 
> Account for the vdso length when randomizing the offset from the
> stack.  The vdso is placed immediately following the sigpage with a
> separate install_special_mapping call in arm_install_vdso.
> 
> Signed-off-by: Nathan Lynch <nathan_lynch@mentor.com>
> ---
>  arch/arm/kernel/process.c |  13 +++-
>  arch/arm/kernel/vdso.c    | 168 ++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 179 insertions(+), 2 deletions(-)
>  create mode 100644 arch/arm/kernel/vdso.c
> 
> diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
> index 40445fb71ac9..39b0d68aa068 100644
> --- a/arch/arm/kernel/process.c
> +++ b/arch/arm/kernel/process.c
> @@ -520,6 +520,7 @@ extern struct page *get_signal_page(void);
>  int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>  {
>  	struct mm_struct *mm = current->mm;
> +	unsigned long npages;
>  	unsigned long addr;
>  	unsigned long hint;
>  	int ret;
> @@ -529,9 +530,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>  	if (!signal_page)
>  		return -ENOMEM;
>  
> +	npages = (vdso_mapping_len >> PAGE_SHIFT) + 1;
> +
>  	down_write(&mm->mmap_sem);
> -	hint = vdso_addr(mm, 1);
> -	addr = get_unmapped_area(NULL, hint, PAGE_SIZE, 0, 0);
> +	hint = vdso_addr(mm, npages);
> +	addr = get_unmapped_area(NULL, hint, npages, 0, 0);
>  	if (IS_ERR_VALUE(addr)) {
>  		ret = addr;
>  		goto up_fail;
> @@ -544,6 +547,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>  	if (ret == 0)
>  		mm->context.sigpage = addr;
>  
> +	/* Unlike the sigpage, failure to install the vdso is unlikely
> +	 * to be fatal to the process, so no error check needed
> +	 * here.
> +	 */
> +	arm_install_vdso(mm, addr + PAGE_SIZE);
> +
>   up_fail:
>  	up_write(&mm->mmap_sem);
>  	return ret;
> diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c
> new file mode 100644
> index 000000000000..0cfd25f09adf
> --- /dev/null
> +++ b/arch/arm/kernel/vdso.c
> @@ -0,0 +1,168 @@
> +/*
> + * Adapted from arm64 version.
> + *
> + * Copyright (C) 2012 ARM Limited
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <linux/err.h>
> +#include <linux/kernel.h>
> +#include <linux/mm.h>
> +#include <linux/slab.h>
> +#include <linux/timekeeper_internal.h>
> +#include <linux/vmalloc.h>
> +
> +#include <asm/barrier.h>
> +#include <asm/cacheflush.h>
> +#include <asm/page.h>
> +#include <asm/vdso.h>
> +#include <asm/vdso_datapage.h>
> +
> +static struct page **vdso_pagelist;
> +
> +unsigned long vdso_mapping_len __read_mostly;
> +
> +/*
> + * The vDSO data page.
> + */
> +static union vdso_data_store vdso_data_store __page_aligned_data;
> +static struct vdso_data *vdso_data = &vdso_data_store.data;
> +
> +static int __init vdso_init(void)
> +{
> +	unsigned long vdso_pages;
> +	int i;
> +
> +	if (memcmp(&vdso_start, "\177ELF", 4)) {
> +		pr_err("vDSO is not a valid ELF object!\n");
> +		return -ENOEXEC;
> +	}
> +
> +	vdso_pages = (&vdso_end - &vdso_start) >> PAGE_SHIFT;
> +	pr_debug("vdso: %ld code pages at base %p\n", vdso_pages, &vdso_start);
> +
> +	/* Allocate the vDSO pagelist, plus a page for the data. */
> +	vdso_pagelist = kcalloc(vdso_pages + 1, sizeof(struct page *),
> +				GFP_KERNEL);
> +	if (vdso_pagelist == NULL)
> +		return -ENOMEM;
> +
> +	/* Grab the vDSO data page. */
> +	vdso_pagelist[0] = virt_to_page(vdso_data);
> +
> +	/* Grab the vDSO code pages. */
> +	for (i = 0; i < vdso_pages; i++)
> +		vdso_pagelist[i + 1] = virt_to_page(&vdso_start + i * PAGE_SIZE);
> +
> +	/* Precompute the mapping size */
> +	vdso_mapping_len = (vdso_pages + 1) << PAGE_SHIFT;
> +
> +	return 0;
> +}
> +arch_initcall(vdso_init);
> +
> +/* assumes mmap_sem is write-locked */
> +void arm_install_vdso(struct mm_struct *mm, unsigned long addr)
> +{
> +	int ret;
> +
> +	mm->context.vdso = ~0UL;
> +
> +	if (vdso_pagelist == NULL)
> +		return;
> +
> +	/*
> +	 * Put vDSO base into mm struct before calling
> +	 * install_special_mapping so the perf counter mmap tracking
> +	 * code will recognise it as a vDSO.
> +	 */
> +	mm->context.vdso = addr;
> +
> +	ret = install_special_mapping(mm, addr, vdso_mapping_len,
> +				      VM_READ|VM_EXEC|
> +				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
> +				      vdso_pagelist);

Eek.  You're mapping the shared data VM_MAYWRITE.  This will cause
bizarre and confusing failures if ptrace pokes at it.  You also seem to
be sticking it *before* the vdso in the same vma.  This will severely
piss off all the tools that assume that "[vdso]" points to an ELF object.

x86 calls this thing "[vvar]" and sticks it after the vdso.  You might
not want to have the complexity of sticking it after the vdso (it's
distinctly nontrivial), but I see nothing wrong with giving it its own
vma just before the vdso.  The new _install_special_mapping function
makes it considerably easier to do.

(Don't use any version of x86's code before 3.16-rc3 as a reference.)

--Andy
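
As a rough sketch of Andy's suggestion (not code from this series): with the
3.16-era struct vm_special_mapping and _install_special_mapping(), the data
page can get its own named, read-only vma just before the vdso text. The
vdso_data_page variable, install_vvar() helper, and exact flag choice below
are assumptions for illustration:

#include <linux/err.h>
#include <linux/mm.h>

static struct page *vdso_data_page;	/* hypothetical: set up in vdso_init() */

static const struct vm_special_mapping vvar_spec = {
	.name  = "[vvar]",
	.pages = &vdso_data_page,
};

/* Map the data page in its own vma, readable only, before the vdso text. */
static int install_vvar(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	vma = _install_special_mapping(mm, addr, PAGE_SIZE,
				       VM_READ | VM_MAYREAD,
				       &vvar_spec);
	return PTR_ERR_OR_ZERO(vma);
}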
Will Deacon July 1, 2014, 9:03 a.m. UTC | #2
On Mon, Jun 30, 2014 at 10:37:48PM +0100, Andy Lutomirski wrote:
> On 06/22/2014 08:11 PM, Nathan Lynch wrote:
> > Initialize the vdso page list at boot, install the vdso mapping at
> > exec time, and update the data page during timer ticks.  This code is
> > not built if CONFIG_VDSO is not enabled.
> > 
> > Account for the vdso length when randomizing the offset from the
> > stack.  The vdso is placed immediately following the sigpage with a
> > separate install_special_mapping call in arm_install_vdso.

[...]

> > +/* assumes mmap_sem is write-locked */
> > +void arm_install_vdso(struct mm_struct *mm, unsigned long addr)
> > +{
> > +	int ret;
> > +
> > +	mm->context.vdso = ~0UL;
> > +
> > +	if (vdso_pagelist == NULL)
> > +		return;
> > +
> > +	/*
> > +	 * Put vDSO base into mm struct before calling
> > +	 * install_special_mapping so the perf counter mmap tracking
> > +	 * code will recognise it as a vDSO.
> > +	 */
> > +	mm->context.vdso = addr;
> > +
> > +	ret = install_special_mapping(mm, addr, vdso_mapping_len,
> > +				      VM_READ|VM_EXEC|
> > +				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
> > +				      vdso_pagelist);
> 
> Eek.  You're mapping the shared data VM_MAYWRITE.  This will cause
> bizarre and confusing failures if ptrace pokes at it.

Hmm, but how else can we support software breakpoints on the vdso?

Will
Nathan Lynch July 1, 2014, 2:01 p.m. UTC | #3
On 06/30/2014 04:37 PM, Andy Lutomirski wrote:
> On 06/22/2014 08:11 PM, Nathan Lynch wrote:
>> +void arm_install_vdso(struct mm_struct *mm, unsigned long addr)
>> +{
>> +	int ret;
>> +
>> +	mm->context.vdso = ~0UL;
>> +
>> +	if (vdso_pagelist == NULL)
>> +		return;
>> +
>> +	/*
>> +	 * Put vDSO base into mm struct before calling
>> +	 * install_special_mapping so the perf counter mmap tracking
>> +	 * code will recognise it as a vDSO.
>> +	 */
>> +	mm->context.vdso = addr;
>> +
>> +	ret = install_special_mapping(mm, addr, vdso_mapping_len,
>> +				      VM_READ|VM_EXEC|
>> +				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
>> +				      vdso_pagelist);
> 
> Eek.  You're mapping the shared data VM_MAYWRITE.  This will cause
> bizarre and confusing failures if ptrace pokes at it.

I'm aware of that.  One could argue (as does the author of the
equivalent code in powerpc) that this is a "well, don't do that"
situation.  But I tend to agree that it would be nicer to prevent this
failure mode.


> You also seem to
> be sticking it *before* the vdso in the same vma.  This will severely
> piss off all the tools that assume that "[vdso]" points to an ELF object.

Hmm, which tools?  Shouldn't they be consulting AT_SYSINFO_EHDR in the
auxiliary vector instead?
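
For reference, the portable lookup is a one-liner in userspace, assuming
glibc 2.16+ for getauxval(3):

#include <elf.h>
#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	/* Base of the vdso ELF header, wherever the vma layout puts it. */
	unsigned long vdso = getauxval(AT_SYSINFO_EHDR);

	printf("vdso at %#lx\n", vdso);
	return 0;
}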


> x86 calls this thing "[vvar]" and sticks it after the vdso.  You might
> not want to have the complexity of sticking it after the vdso (it's
> distinctly nontrivial), but I see nothing wrong with giving it its own
> vma just before the vdso.  The new _install_special_mapping function
> makes it considerably easier to do.

I'll give this a shot, thanks.
Andy Lutomirski July 1, 2014, 2:09 p.m. UTC | #4
On Tue, Jul 1, 2014 at 7:01 AM, Nathan Lynch <Nathan_Lynch@mentor.com> wrote:
> On 06/30/2014 04:37 PM, Andy Lutomirski wrote:
>> On 06/22/2014 08:11 PM, Nathan Lynch wrote:
>>> +void arm_install_vdso(struct mm_struct *mm, unsigned long addr)
>>> +{
>>> +    int ret;
>>> +
>>> +    mm->context.vdso = ~0UL;
>>> +
>>> +    if (vdso_pagelist == NULL)
>>> +            return;
>>> +
>>> +    /*
>>> +     * Put vDSO base into mm struct before calling
>>> +     * install_special_mapping so the perf counter mmap tracking
>>> +     * code will recognise it as a vDSO.
>>> +     */
>>> +    mm->context.vdso = addr;
>>> +
>>> +    ret = install_special_mapping(mm, addr, vdso_mapping_len,
>>> +                                  VM_READ|VM_EXEC|
>>> +                                  VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
>>> +                                  vdso_pagelist);
>>
>> Eek.  You're mapping the shared data VM_MAYWRITE.  This will cause
>> bizarre and confusing failures if ptrace pokes at it.
>
> I'm aware of that.  One could argue (as does the author of the
> equivalent code in powerpc) that this is a "well, don't do that"
> situation.  But I tend to agree that it would be nicer to prevent this
> failure mode.
>
>
>> You also seem to
>> be sticking it *before* the vdso in the same vma.  This will severely
>> piss off all the tools that assume that "[vdso]" points to an ELF object.
>
> Hmm, which tools?  Shouldn't they be consulting AT_SYSINFO_EHDR in the
> auxiliary vector instead?
>

There were a bunch of random things written before getauxval(3) was
introduced that look for "[vdso]".

gdb is also a bit weird.  I think it's changed a couple of times, but
IIRC it looks for a vma whose start address matches AT_SYSINFO_EHDR,
and then it parses the ELF header and *section* headers (sigh) in that
vma.  So if I'm remembering correctly, it just won't notice the vdso
at all if AT_SYSINFO_EHDR and the vma start aren't the same.

>
>> x86 calls this thing "[vvar]" and sticks it after the vdso.  You might
>> not want to have the complexity of sticking it after the vdso (it's
>> distinctly nontrivial), but I see nothing wrong with giving it its own
>> vma just before the vdso.  The new _install_special_mapping function
>> makes it considerably easier to do.
>
> I'll give this a shot, thanks.
>

--Andy
Nathan Lynch July 1, 2014, 2:11 p.m. UTC | #5
On 07/01/2014 04:03 AM, Will Deacon wrote:
> On Mon, Jun 30, 2014 at 10:37:48PM +0100, Andy Lutomirski wrote:
>> On 06/22/2014 08:11 PM, Nathan Lynch wrote:
>>> Initialize the vdso page list at boot, install the vdso mapping at
>>> exec time, and update the data page during timer ticks.  This code is
>>> not built if CONFIG_VDSO is not enabled.
>>>
>>> Account for the vdso length when randomizing the offset from the
>>> stack.  The vdso is placed immediately following the sigpage with a
>>> separate install_special_mapping call in arm_install_vdso.
> 
> [...]
> 
>>> +/* assumes mmap_sem is write-locked */
>>> +void arm_install_vdso(struct mm_struct *mm, unsigned long addr)
>>> +{
>>> +	int ret;
>>> +
>>> +	mm->context.vdso = ~0UL;
>>> +
>>> +	if (vdso_pagelist == NULL)
>>> +		return;
>>> +
>>> +	/*
>>> +	 * Put vDSO base into mm struct before calling
>>> +	 * install_special_mapping so the perf counter mmap tracking
>>> +	 * code will recognise it as a vDSO.
>>> +	 */
>>> +	mm->context.vdso = addr;
>>> +
>>> +	ret = install_special_mapping(mm, addr, vdso_mapping_len,
>>> +				      VM_READ|VM_EXEC|
>>> +				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
>>> +				      vdso_pagelist);
>>
>> Eek.  You're mapping the shared data VM_MAYWRITE.  This will cause
>> bizarre and confusing failures if ptrace pokes at it.
> 
> Hmm, but how else can we support software breakpoints on the vdso?

I believe Andy is suggesting separate VMAs (with different VM flags) for
the VDSO's data and code.  So, breakpoints in code would work, but
attempts to modify the data page via ptrace() would fail outright
instead of silently COWing.
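
Concretely, given this patch's pagelist layout (data page at index 0, code
pages following), the split could look like two install_special_mapping()
calls with different flags. This is an illustrative fragment standing in for
the single call in arm_install_vdso(), not posted code:

	/* Data page: readable only, so ptrace pokes fail cleanly. */
	ret = install_special_mapping(mm, addr, PAGE_SIZE,
				      VM_READ | VM_MAYREAD,
				      &vdso_pagelist[0]);
	if (ret)
		goto out;

	/* Code pages: keep VM_MAYWRITE so debuggers can plant breakpoints. */
	ret = install_special_mapping(mm, addr + PAGE_SIZE,
				      vdso_mapping_len - PAGE_SIZE,
				      VM_READ | VM_EXEC |
				      VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
				      &vdso_pagelist[1]);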
Russell King - ARM Linux July 1, 2014, 2:14 p.m. UTC | #6
On Tue, Jul 01, 2014 at 07:09:46AM -0700, Andy Lutomirski wrote:
> On Tue, Jul 1, 2014 at 7:01 AM, Nathan Lynch <Nathan_Lynch@mentor.com> wrote:
> > Hmm, which tools?  Shouldn't they be consulting AT_SYSINFO_EHDR in the
> > auxiliary vector instead?
> >
> 
> There were a bunch of random things written before getauxval(3) was
> introduced that look for "[vdso]".
> 
> gdb is also a bit weird.  I think it's changed a couple of times, but
> IIRC it looks for a vma whose start address matches AT_SYSINFO_EHDR,
> and then it parses the ELF header and *section* headers (sigh) in that
> vma.  So if I'm remembering correctly, it just won't notice the vdso
> at all if AT_SYSINFO_EHDR and the vma start aren't the same.

Hmm, that would explain the weird error I got when trying gdb with the
vdso in place.
Will Deacon July 1, 2014, 2:15 p.m. UTC | #7
On Tue, Jul 01, 2014 at 03:11:04PM +0100, Nathan Lynch wrote:
> On 07/01/2014 04:03 AM, Will Deacon wrote:
> > On Mon, Jun 30, 2014 at 10:37:48PM +0100, Andy Lutomirski wrote:
> >> On 06/22/2014 08:11 PM, Nathan Lynch wrote:
> >>> Initialize the vdso page list at boot, install the vdso mapping at
> >>> exec time, and update the data page during timer ticks.  This code is
> >>> not built if CONFIG_VDSO is not enabled.
> >>>
> >>> Account for the vdso length when randomizing the offset from the
> >>> stack.  The vdso is placed immediately following the sigpage with a
> >>> separate install_special_mapping call in arm_install_vdso.
> > 
> > [...]
> > 
> >>> +/* assumes mmap_sem is write-locked */
> >>> +void arm_install_vdso(struct mm_struct *mm, unsigned long addr)
> >>> +{
> >>> +	int ret;
> >>> +
> >>> +	mm->context.vdso = ~0UL;
> >>> +
> >>> +	if (vdso_pagelist == NULL)
> >>> +		return;
> >>> +
> >>> +	/*
> >>> +	 * Put vDSO base into mm struct before calling
> >>> +	 * install_special_mapping so the perf counter mmap tracking
> >>> +	 * code will recognise it as a vDSO.
> >>> +	 */
> >>> +	mm->context.vdso = addr;
> >>> +
> >>> +	ret = install_special_mapping(mm, addr, vdso_mapping_len,
> >>> +				      VM_READ|VM_EXEC|
> >>> +				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
> >>> +				      vdso_pagelist);
> >>
> >> Eek.  You're mapping the shared data VM_MAYWRITE.  This will cause
> >> bizarre and confusing failures if ptrace pokes at it.
> > 
> > Hmm, but how else can we support software breakpoints on the vdso?
> 
> I believe Andy is suggesting separate VMAs (with different VM flags) for
> the VDSO's data and code.  So, breakpoints in code would work, but
> attempts to modify the data page via ptrace() would fail outright
> instead of silently COWing.

Ah, yes. That makes a lot of sense for the data page -- we should do
something similar on arm64 too, since the CoW will break everything for the
task being debugged. We could drop the EXEC flags too.

Thanks for the explanation,

Will
Andy Lutomirski July 1, 2014, 2:17 p.m. UTC | #8
On Tue, Jul 1, 2014 at 7:15 AM, Will Deacon <will.deacon@arm.com> wrote:
> On Tue, Jul 01, 2014 at 03:11:04PM +0100, Nathan Lynch wrote:
>> On 07/01/2014 04:03 AM, Will Deacon wrote:
>> > On Mon, Jun 30, 2014 at 10:37:48PM +0100, Andy Lutomirski wrote:
>> >> On 06/22/2014 08:11 PM, Nathan Lynch wrote:
>> >>> Initialize the vdso page list at boot, install the vdso mapping at
>> >>> exec time, and update the data page during timer ticks.  This code is
>> >>> not built if CONFIG_VDSO is not enabled.
>> >>>
>> >>> Account for the vdso length when randomizing the offset from the
>> >>> stack.  The vdso is placed immediately following the sigpage with a
>> >>> separate install_special_mapping call in arm_install_vdso.
>> >
>> > [...]
>> >
>> >>> +/* assumes mmap_sem is write-locked */
>> >>> +void arm_install_vdso(struct mm_struct *mm, unsigned long addr)
>> >>> +{
>> >>> + int ret;
>> >>> +
>> >>> + mm->context.vdso = ~0UL;
>> >>> +
>> >>> + if (vdso_pagelist == NULL)
>> >>> +         return;
>> >>> +
>> >>> + /*
>> >>> +  * Put vDSO base into mm struct before calling
>> >>> +  * install_special_mapping so the perf counter mmap tracking
>> >>> +  * code will recognise it as a vDSO.
>> >>> +  */
>> >>> + mm->context.vdso = addr;
>> >>> +
>> >>> + ret = install_special_mapping(mm, addr, vdso_mapping_len,
>> >>> +                               VM_READ|VM_EXEC|
>> >>> +                               VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
>> >>> +                               vdso_pagelist);
>> >>
>> >> Eek.  You're mapping the shared data VM_MAYWRITE.  This will cause
>> >> bizarre and confusing failures if ptrace pokes at it.
>> >
>> > Hmm, but how else can we support software breakpoints on the vdso?
>>
>> I believe Andy is suggesting separate VMAs (with different VM flags) for
>> the VDSO's data and code.  So, breakpoints in code would work, but
>> attempts to modify the data page via ptrace() would fail outright
>> instead of silently COWing.
>
> Ah, yes. That makes a lot of sense for the data page -- we should do
> something similar on arm64 too, since the CoW will break everything for the
> task being debugged. We could drop the EXEC flags too.

If you do this, I have a slight preference for the new vma being
called "[vvar]" to match x86.  It'll make the CRIU people happy if and
when they port it to ARM.

--Andy
Christopher Covington July 1, 2014, 5:27 p.m. UTC | #9
On 07/01/2014 10:17 AM, Andy Lutomirski wrote:
> On Tue, Jul 1, 2014 at 7:15 AM, Will Deacon <will.deacon@arm.com> wrote:
>> On Tue, Jul 01, 2014 at 03:11:04PM +0100, Nathan Lynch wrote:
>>> On 07/01/2014 04:03 AM, Will Deacon wrote:
>>>> On Mon, Jun 30, 2014 at 10:37:48PM +0100, Andy Lutomirski wrote:
>>>>> On 06/22/2014 08:11 PM, Nathan Lynch wrote:
>>>>>> Initialize the vdso page list at boot, install the vdso mapping at
>>>>>> exec time, and update the data page during timer ticks.  This code is
>>>>>> not built if CONFIG_VDSO is not enabled.
>>>>>>
>>>>>> Account for the vdso length when randomizing the offset from the
>>>>>> stack.  The vdso is placed immediately following the sigpage with a
>>>>>> separate install_special_mapping call in arm_install_vdso.
>>>>
>>>> [...]
>>>>
>>>>>> +/* assumes mmap_sem is write-locked */
>>>>>> +void arm_install_vdso(struct mm_struct *mm, unsigned long addr)
>>>>>> +{
>>>>>> + int ret;
>>>>>> +
>>>>>> + mm->context.vdso = ~0UL;
>>>>>> +
>>>>>> + if (vdso_pagelist == NULL)
>>>>>> +         return;
>>>>>> +
>>>>>> + /*
>>>>>> +  * Put vDSO base into mm struct before calling
>>>>>> +  * install_special_mapping so the perf counter mmap tracking
>>>>>> +  * code will recognise it as a vDSO.
>>>>>> +  */
>>>>>> + mm->context.vdso = addr;
>>>>>> +
>>>>>> + ret = install_special_mapping(mm, addr, vdso_mapping_len,
>>>>>> +                               VM_READ|VM_EXEC|
>>>>>> +                               VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
>>>>>> +                               vdso_pagelist);
>>>>>
>>>>> Eek.  You're mapping the shared data VM_MAYWRITE.  This will cause
>>>>> bizarre and confusing failures if ptrace pokes at it.
>>>>
>>>> Hmm, but how else can we support software breakpoints on the vdso?
>>>
>>> I believe Andy is suggesting separate VMAs (with different VM flags) for
>>> the VDSO's data and code.  So, breakpoints in code would work, but
>>> attempts to modify the data page via ptrace() would fail outright
>>> instead of silently COWing.
>>
>> Ah, yes. That makes a lot of sense for the data page -- we should do
>> something similar on arm64 too, since the CoW will break everything for the
>> task being debugged. We could also drop the EXEC flags too.
> 
> If you do this, I have a slight preference for the new vma being
> called "[vvar]" to match x86.  It'll make the CRIU people happy if and
> when they port it to ARM.

CRIU is functional on AArch32 and AArch64. I use norandmaps and identical
kernels, so I haven't needed explicit VDSO support (on A64), but for
whoever does need it, it would certainly be nice for things to be as much
the same as possible across architectures. Here is some description of it:

http://git.criu.org/?p=criu.git;a=commit;h=fe7b8aeb8c65e11e190282aae8db2d2bebd4f6e9

Christopher

Patch

diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 40445fb71ac9..39b0d68aa068 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -520,6 +520,7 @@ extern struct page *get_signal_page(void);
 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 {
 	struct mm_struct *mm = current->mm;
+	unsigned long npages;
 	unsigned long addr;
 	unsigned long hint;
 	int ret;
@@ -529,9 +530,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	if (!signal_page)
 		return -ENOMEM;
 
+	npages = (vdso_mapping_len >> PAGE_SHIFT) + 1;
+
 	down_write(&mm->mmap_sem);
-	hint = vdso_addr(mm, 1);
-	addr = get_unmapped_area(NULL, hint, PAGE_SIZE, 0, 0);
+	hint = vdso_addr(mm, npages);
+	addr = get_unmapped_area(NULL, hint, npages, 0, 0);
 	if (IS_ERR_VALUE(addr)) {
 		ret = addr;
 		goto up_fail;
@@ -544,6 +547,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	if (ret == 0)
 		mm->context.sigpage = addr;
 
+	/* Unlike the sigpage, failure to install the vdso is unlikely
+	 * to be fatal to the process, so no error check needed
+	 * here.
+	 */
+	arm_install_vdso(mm, addr + PAGE_SIZE);
+
  up_fail:
 	up_write(&mm->mmap_sem);
 	return ret;
diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c
new file mode 100644
index 000000000000..0cfd25f09adf
--- /dev/null
+++ b/arch/arm/kernel/vdso.c
@@ -0,0 +1,168 @@ 
+/*
+ * Adapted from arm64 version.
+ *
+ * Copyright (C) 2012 ARM Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/vmalloc.h>
+
+#include <asm/barrier.h>
+#include <asm/cacheflush.h>
+#include <asm/page.h>
+#include <asm/vdso.h>
+#include <asm/vdso_datapage.h>
+
+static struct page **vdso_pagelist;
+
+unsigned long vdso_mapping_len __read_mostly;
+
+/*
+ * The vDSO data page.
+ */
+static union vdso_data_store vdso_data_store __page_aligned_data;
+static struct vdso_data *vdso_data = &vdso_data_store.data;
+
+static int __init vdso_init(void)
+{
+	unsigned long vdso_pages;
+	int i;
+
+	if (memcmp(&vdso_start, "\177ELF", 4)) {
+		pr_err("vDSO is not a valid ELF object!\n");
+		return -ENOEXEC;
+	}
+
+	vdso_pages = (&vdso_end - &vdso_start) >> PAGE_SHIFT;
+	pr_debug("vdso: %ld code pages at base %p\n", vdso_pages, &vdso_start);
+
+	/* Allocate the vDSO pagelist, plus a page for the data. */
+	vdso_pagelist = kcalloc(vdso_pages + 1, sizeof(struct page *),
+				GFP_KERNEL);
+	if (vdso_pagelist == NULL)
+		return -ENOMEM;
+
+	/* Grab the vDSO data page. */
+	vdso_pagelist[0] = virt_to_page(vdso_data);
+
+	/* Grab the vDSO code pages. */
+	for (i = 0; i < vdso_pages; i++)
+		vdso_pagelist[i + 1] = virt_to_page(&vdso_start + i * PAGE_SIZE);
+
+	/* Precompute the mapping size */
+	vdso_mapping_len = (vdso_pages + 1) << PAGE_SHIFT;
+
+	return 0;
+}
+arch_initcall(vdso_init);
+
+/* assumes mmap_sem is write-locked */
+void arm_install_vdso(struct mm_struct *mm, unsigned long addr)
+{
+	int ret;
+
+	mm->context.vdso = ~0UL;
+
+	if (vdso_pagelist == NULL)
+		return;
+
+	/*
+	 * Put vDSO base into mm struct before calling
+	 * install_special_mapping so the perf counter mmap tracking
+	 * code will recognise it as a vDSO.
+	 */
+	mm->context.vdso = addr;
+
+	ret = install_special_mapping(mm, addr, vdso_mapping_len,
+				      VM_READ|VM_EXEC|
+				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
+				      vdso_pagelist);
+	if (ret) {
+		pr_notice_once("%s: install_special_mapping failed (%d)\n",
+			       __func__, ret);
+		mm->context.vdso = ~0UL;
+		return;
+	}
+}
+
+static void vdso_write_begin(struct vdso_data *vdata)
+{
+	++vdso_data->seq_count;
+	smp_wmb();
+}
+
+static void vdso_write_end(struct vdso_data *vdata)
+{
+	smp_wmb();
+	++vdso_data->seq_count;
+}
+
+/**
+ * update_vsyscall - update the vdso data page
+ *
+ * Increment the sequence counter, making it odd, indicating to
+ * userspace that an update is in progress.  Update the fields used
+ * for coarse clocks and, if the architected system timer is in use,
+ * the fields used for high precision clocks.  Increment the sequence
+ * counter again, making it even, indicating to userspace that the
+ * update is finished.
+ *
+ * Userspace is expected to sample seq_count before reading any other
+ * fields from the data page.  If seq_count is odd, userspace is
+ * expected to wait until it becomes even.  After copying data from
+ * the page, userspace must sample seq_count again; if it has changed
+ * from its previous value, userspace must retry the whole sequence.
+ *
+ * Calls to update_vsyscall are serialized by the timekeeping core.
+ */
+void update_vsyscall(struct timekeeper *tk)
+{
+	struct timespec xtime_coarse;
+	struct timespec *wtm = &tk->wall_to_monotonic;
+	bool use_syscall = strcmp(tk->clock->name, "arch_sys_counter");
+
+	vdso_write_begin(vdso_data);
+
+	xtime_coarse = __current_kernel_time();
+	vdso_data->use_syscall			= use_syscall;
+	vdso_data->xtime_coarse_sec		= xtime_coarse.tv_sec;
+	vdso_data->xtime_coarse_nsec		= xtime_coarse.tv_nsec;
+	vdso_data->wtm_clock_sec		= wtm->tv_sec;
+	vdso_data->wtm_clock_nsec		= wtm->tv_nsec;
+
+	if (!use_syscall) {
+		vdso_data->cs_cycle_last	= tk->cycle_last;
+		vdso_data->xtime_clock_sec	= tk->xtime_sec;
+		vdso_data->xtime_clock_snsec	= tk->xtime_nsec;
+		vdso_data->cs_mult		= tk->mult;
+		vdso_data->cs_shift		= tk->shift;
+		vdso_data->cs_mask		= tk->clock->mask;
+	}
+
+	vdso_write_end(vdso_data);
+
+	flush_dcache_page(virt_to_page(vdso_data));
+}
+
+void update_vsyscall_tz(void)
+{
+	vdso_data->tz_minuteswest	= sys_tz.tz_minuteswest;
+	vdso_data->tz_dsttime		= sys_tz.tz_dsttime;
+	flush_dcache_page(virt_to_page(vdso_data));
+}
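
The retry protocol documented in the update_vsyscall() comment above
corresponds to a reader loop along these lines. This is an illustrative
userspace sketch; the struct below is a made-up subset of the real
asm/vdso_datapage.h layout, not its actual definition:

#include <stdint.h>

struct vdso_data {			/* illustrative subset of the fields */
	uint32_t seq_count;
	uint32_t xtime_coarse_sec;
	uint32_t xtime_coarse_nsec;
};

static void read_coarse_time(const volatile struct vdso_data *vd,
			     uint32_t *sec, uint32_t *nsec)
{
	uint32_t seq;

	do {
		/* Odd count: an update is in progress, wait it out. */
		while ((seq = vd->seq_count) & 1)
			;
		__sync_synchronize();	/* pairs with smp_wmb() in the writer */
		*sec  = vd->xtime_coarse_sec;
		*nsec = vd->xtime_coarse_nsec;
		__sync_synchronize();
		/* Count changed while we read: retry the whole sequence. */
	} while (vd->seq_count != seq);
}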