
mm: fix maxnode for mbind(), set_mempolicy() and migrate_pages()

Message ID 20240720173543.897972-1-jglisse@google.com (mailing list archive)
State New
Series mm: fix maxnode for mbind(), set_mempolicy() and migrate_pages()

Commit Message

Jerome Glisse July 20, 2024, 5:35 p.m. UTC
Because of the maxnode bug there is no way to bind or migrate pages to
the last node of a multi-node NUMA system unless you lie about maxnode
when making the mbind(), set_mempolicy() or migrate_pages() syscall.

The manpage for those syscalls describes maxnode as the number of bits
in the node bitmap ("bit mask of nodes containing up to maxnode bits").
Thus if maxnode is n we expect an n-bit bitmap, which means the mask of
valid bits is ((1 << n) - 1). The get_nodes() decrement leads to the
mask being ((1 << (n - 1)) - 1).

The three syscalls use a common helper, get_nodes(), and the first
thing this helper does is decrement maxnode by 1, which leads to using
only n-1 bits of the provided mask of nodes (see get_bitmap(), a helper
function used by get_nodes()).

This leads to two bugs: either the last node in the provided bitmap is
silently ignored by each of the three syscalls, or the syscall errors
out with EINVAL when the only bit set in the bitmap is that last bit
(the bit is ignored because of the bug, and an empty mask of nodes is
an invalid argument).
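
In user-space terms the pre-patch truncation can be modelled roughly as
follows (a simplified sketch for a single unsigned long mask; the real
get_nodes()/get_bitmap() also copy the mask from user space and perform
additional validation):

/* Model of the mask the kernel effectively uses before the fix: with
 * maxnode = n only the low n - 1 bits of the user mask survive. */
static unsigned long buggy_effective_mask(unsigned long user_mask,
                                          unsigned long maxnode)
{
    unsigned long bits = maxnode - 1;   /* the --maxnode decrement */
    unsigned long valid = bits >= 64 ? ~0UL : ((1UL << bits) - 1);

    return user_mask & valid;           /* last requested bit is dropped */
}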

I am surprised this bug was never caught ... it has been in the kernel
since forever.

People can use the following function to detect if the kernel has the
bug:

bool kernel_has_maxnodes_bug(void)
{
    unsigned long nodemask = 1;
    bool has_bug;
    long res;

    res = set_mempolicy(MPOL_BIND, &nodemask, 1);
    has_bug = res && (errno == EINVAL);
    set_mempolicy(MPOL_DEFAULT, NULL, 0);
    return has_bug;
}
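
For example, a minimal self-contained checker built around the same call
sequence could look like this (an illustrative sketch; "check.c" is just
a placeholder name, build with something like gcc check.c -o check -lnuma):

#include <errno.h>
#include <stdio.h>
#include <numaif.h>

int main(void)
{
    unsigned long nodemask = 1; /* request node 0 only */
    int has_bug;
    long res;

    /* On a buggy kernel maxnode = 1 is decremented to 0, the mask is
     * treated as empty and the call fails with EINVAL. */
    res = set_mempolicy(MPOL_BIND, &nodemask, 1);
    has_bug = res && (errno == EINVAL);
    set_mempolicy(MPOL_DEFAULT, NULL, 0);

    printf("maxnode bug: %s\n", has_bug ? "present" : "absent");
    return 0;
}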

You can test with any of the three programs below:

gcc mbind.c -o mbind -lnuma
gcc set_mempolicy.c -o set_mempolicy -lnuma
gcc migrate_pages.c -o migrate_pages -lnuma

The first argument is maxnode, the second argument is the bit index to
set in the mask of nodes (0 sets the first bit, 1 the second bit, ...).

./mbind 2 1 & sleep 2 && numastat -n -p `pidof mbind` && fg
./set_mempolicy 2 1 & sleep 2 && numastat -n -p `pidof set_mempolicy` && fg
./migrate_pages 2 1 & sleep 2 && numastat -n -p `pidof migrate_pages` && fg

mbind.c %< ----------------------------------------------------------

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <numaif.h>

#define NPAGES 16 /* number of 4KiB test pages, arbitrary */

void *anon_mem(size_t size)
{
    void *ret;

    ret = mmap(NULL, size, PROT_READ|PROT_WRITE,
               MAP_PRIVATE|MAP_ANON, -1, 0);
    return ret == MAP_FAILED ? NULL : ret;
}

unsigned long mround(unsigned long v, unsigned long m)
{
    if (m == 0) {
        return v;
    }

    return v + m - (v % m);
}

void bitmap_set(void *_bitmap, unsigned long b)
{
    uint8_t *bitmap = _bitmap;

    bitmap[b >> 3] |= (1 << (b & 7));
}

int main(int argc, char *argv[])
{
    unsigned long *nodemask, maxnode, node, i;
    size_t bytes;
    int8_t *mem;
    long res;

    if (argv[1] == NULL || argv[2] == NULL) {
        printf("missing argument: %s maxnodes node\n", argv[0]);
        return -1;
    }
    maxnode = atoi(argv[1]);
    node = atoi(argv[2]);

    bytes = mround(mround(maxnode, 8) >> 3,
                   sizeof(unsigned long));
    nodemask = calloc(bytes, 1);
    mem = anon_mem(NPAGES << 12);
    if (!mem || !nodemask) {
        return -1;
    }

    // Try to bind memory to node
    bitmap_set(nodemask, node);
    res = mbind(mem, NPAGES << 12, MPOL_BIND,
                nodemask, maxnode, 0);
    if (res) {
        printf("mbind(mem, NPAGES << 12, MPOL_BIND, "
               "nodemask, %d, 0) failed with %d\n",
               maxnode, errno);
        return -1;
    }

    // Write something so the pages are actually allocated (no shared zero page)
    for (unsigned i = 0; i < NPAGES; i++) {
        mem[i << 12] = i + 1;
    }

    // Wait here so numastat can gather statistics
    getchar();

    return 0;
}

set_mempolicy %< ----------------------------------------------------

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <numaif.h>

#define NPAGES 16 /* number of 4KiB test pages, arbitrary */

void *anon_mem(size_t size)
{
    void *ret;

    ret = mmap(NULL, size, PROT_READ|PROT_WRITE,
               MAP_PRIVATE|MAP_ANON, -1, 0);
    return ret == MAP_FAILED ? NULL : ret;
}

unsigned long mround(unsigned long v, unsigned long m)
{
    if (m == 0) {
        return v;
    }

    return v + m - (v % m);
}

void bitmap_set(void *_bitmap, unsigned long b)
{
    uint8_t *bitmap = _bitmap;

    bitmap[b >> 3] |= (1 << (b & 7));
}

int main(int argc, char *argv[])
{
    unsigned long *nodemask, maxnode, node, i;
    size_t bytes;
    int8_t *mem;
    long res;

    if (argv[1] == NULL || argv[2] == NULL) {
        printf("missing argument: %s maxnodes node\n", argv[0]);
        return -1;
    }
    maxnode = atoi(argv[1]);
    node = atoi(argv[2]);

    // bind memory to node 0 first ...
    i = 1;
    res = set_mempolicy(MPOL_BIND, &i, 2);
    if (res) {
        printf("set_mempolicy(MPOL_BIND, []=1, 2) "
               "failed with %d\n", errno);
        return -1;
    }

    bytes = mround(mround(maxnode, 8) >> 3,
                   sizeof(unsigned long));
    nodemask = calloc(bytes, 1);
    mem = anon_mem(NPAGES << 12);
    if (!mem || !nodemask) {
        return -1;
    }

    // Try to bind memory to node
    bitmap_set(nodemask, node);
    res = set_mempolicy(MPOL_BIND, nodemask, maxnode);
    if (res) {
        printf("set_mempolicy(MPOL_BIND, nodemask, %d) "
               "failed with %d\n", maxnode, errno);
        return -1;
    }

    // Write something so the pages are actually allocated (no shared zero page)
    for (unsigned i = 0; i < NPAGES; i++) {
        mem[i << 12] = i + 1;
    }

    // Wait here so numastat can gather statistics
    getchar();

    return 0;
}

migrate_pages %< ----------------------------------------------------

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <numaif.h>

#define NPAGES 16 /* number of 4KiB test pages, arbitrary */

void *anon_mem(size_t size)
{
    void *ret;

    ret = mmap(NULL, size, PROT_READ|PROT_WRITE,
               MAP_PRIVATE|MAP_ANON, -1, 0);
    return ret == MAP_FAILED ? NULL : ret;
}

unsigned long mround(unsigned long v, unsigned long m)
{
    if (m == 0) {
        return v;
    }

    return v + m - (v % m);
}

void bitmap_set(void *_bitmap, unsigned long b)
{
    uint8_t *bitmap = _bitmap;

    bitmap[b >> 3] |= (1 << (b & 7));
}

int main(int argc, char *argv[])
{
    unsigned long *old_nodes, *new_nodes, maxnode, node, i;
    size_t bytes;
    int8_t *mem;
    long res;

    if (argv[1] == NULL || argv[2] == NULL) {
        printf("missing argument: %s maxnodes node\n", argv[0]);
        return -1;
    }
    maxnode = atoi(argv[1]);
    node = atoi(argv[2]);

    // bind memory to node 0 ...
    i = 1;
    res = set_mempolicy(MPOL_BIND, &i, 2);
    if (res) {
        printf("set_mempolicy(MPOL_BIND, []=1, %d) "
               "failed with %d\n", maxnode, errno);
        return -1;
    }

    bytes = mround(mround(maxnode, 8) >> 3,
                   sizeof(unsigned long));
    old_nodes = calloc(bytes, 1);
    new_nodes = calloc(bytes, 1);
    mem = anon_mem(NPAGES << 12);
    if (!mem || !new_nodes || !old_nodes) {
        return -1;
    }

    // Write something so the pages are actually allocated (no shared zero page)
    for (unsigned i = 0; i < NPAGES; i++) {
        mem[i << 12] = i + 1;
    }

    // Try to migrate memory from node 0 to node
    bitmap_set(old_nodes, 0);
    bitmap_set(new_nodes, node);
    res = migrate_pages(getpid(), maxnode,
                        old_nodes, new_nodes);
    if (res < 0) {
        printf("migrate_pages(pid, %lu, old_nodes, "
               "new_nodes) failed with %d\n",
               maxnode, errno);
        return -1;
    }

    // Wait here so numastat can gather statistics
    getchar();

    return 0;
}

Signed-off-by: Jérôme Glisse <jglisse@google.com>
To: Andrew Morton <akpm@linux-foundation.org>
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org
---
 mm/mempolicy.c | 1 -
 1 file changed, 1 deletion(-)

Comments

Matthew Wilcox July 20, 2024, 5:55 p.m. UTC | #1
On Sat, Jul 20, 2024 at 10:35:43AM -0700, Jerome Glisse wrote:
> You can tested with any of the three program below:

Do we really need three programs in the commit message, for a total of
287 lines for a one-line change?
Gregory Price July 22, 2024, 9:21 p.m. UTC | #2
On Sat, Jul 20, 2024 at 06:55:52PM +0100, Matthew Wilcox wrote:
> On Sat, Jul 20, 2024 at 10:35:43AM -0700, Jerome Glisse wrote:
> > You can tested with any of the three program below:
> 
> Do we really need three programs in the commit message, for a total of
> 287 lines for a one-line change?

Better yet - why not a ktest / LTP contribution?

~Gregory
Jerome Glisse July 23, 2024, 4:19 p.m. UTC | #3
On Mon, 22 Jul 2024 at 06:09, David Hildenbrand <david@redhat.com> wrote:

> On 20.07.24 19:35, Jerome Glisse wrote:
> > Because maxnode bug there is no way to bind or migrate_pages to the
> > last node in multi-node NUMA system unless you lie about maxnodes
> > when making the mbind, set_mempolicy or migrate_pages syscall.
> >
> > Manpage for those syscall describe maxnodes as the number of bits in
> > the node bitmap ("bit mask of nodes containing up to maxnode bits").
> > Thus if maxnode is n then we expect to have a n bit(s) bitmap which
> > means that the mask of valid bits is ((1 << n) - 1). The get_nodes()
> > decrement lead to the mask being ((1 << (n - 1)) - 1).
> >
> > The three syscalls use a common helper get_nodes() and first things
> > this helper do is decrement maxnode by 1 which leads to using n-1 bits
> > in the provided mask of nodes (see get_bitmap() an helper function to
> > get_nodes()).
> >
> > The lead to two bugs, either the last node in the bitmap provided will
> > not be use in either of the three syscalls, or the syscalls will error
> > out and return EINVAL if the only bit set in the bitmap was the last
> > bit in the mask of nodes (which is ignored because of the bug and an
> > empty mask of nodes is an invalid argument).
> >
> > I am surprised this bug was never caught ... it has been in the kernel
> > since forever.
>
> Let's look at QEMU: backends/hostmem.c
>
>      /*
>       * We can have up to MAX_NODES nodes, but we need to pass maxnode+1
>       * as argument to mbind() due to an old Linux bug (feature?) which
>       * cuts off the last specified node. This means backend->host_nodes
>       * must have MAX_NODES+1 bits available.
>       */
>
> Which means that it's been known for a long time, and the workaround
> seems to be pretty easy.
>
> So I wonder if we rather want to update the documentation to match reality.
>

I think it is kind of weird if we ask to supply maxnodes+1 to work around
the bug. If we apply this patch qemu would continue to work as is while
fixing users that were not aware of that bug. So I would say applying this
patch does more good. Long term qemu can drop its workaround or keep it for
backward compatibility with old kernel.
Jerome Glisse July 23, 2024, 4:33 p.m. UTC | #4
On Mon, 22 Jul 2024 at 06:09, David Hildenbrand <david@redhat.com> wrote:
>
> On 20.07.24 19:35, Jerome Glisse wrote:
> > Because maxnode bug there is no way to bind or migrate_pages to the
> > last node in multi-node NUMA system unless you lie about maxnodes
> > when making the mbind, set_mempolicy or migrate_pages syscall.
> >
> > Manpage for those syscall describe maxnodes as the number of bits in
> > the node bitmap ("bit mask of nodes containing up to maxnode bits").
> > Thus if maxnode is n then we expect to have a n bit(s) bitmap which
> > means that the mask of valid bits is ((1 << n) - 1). The get_nodes()
> > decrement lead to the mask being ((1 << (n - 1)) - 1).
> >
> > The three syscalls use a common helper get_nodes() and first things
> > this helper do is decrement maxnode by 1 which leads to using n-1 bits
> > in the provided mask of nodes (see get_bitmap() an helper function to
> > get_nodes()).
> >
> > The lead to two bugs, either the last node in the bitmap provided will
> > not be use in either of the three syscalls, or the syscalls will error
> > out and return EINVAL if the only bit set in the bitmap was the last
> > bit in the mask of nodes (which is ignored because of the bug and an
> > empty mask of nodes is an invalid argument).
> >
> > I am surprised this bug was never caught ... it has been in the kernel
> > since forever.
>
> Let's look at QEMU: backends/hostmem.c
>
>      /*
>       * We can have up to MAX_NODES nodes, but we need to pass maxnode+1
>       * as argument to mbind() due to an old Linux bug (feature?) which
>       * cuts off the last specified node. This means backend->host_nodes
>       * must have MAX_NODES+1 bits available.
>       */
>
> Which means that it's been known for a long time, and the workaround
> seems to be pretty easy.
>
> So I wonder if we rather want to update the documentation to match reality.

[Sorry resending as text ... gmail insanity]

I think it is kind of weird if we ask to supply maxnodes+1 to work
around the bug. If we apply this patch qemu would continue to work as
is while fixing users that were not aware of that bug. So I would say
applying this patch does more good. Long term qemu can drop its
workaround or keep it for backward compatibility with old kernel.

Thank you,
Jérôme
David Hildenbrand July 23, 2024, 5:37 p.m. UTC | #5
On 23.07.24 18:33, Jerome Glisse wrote:
> On Mon, 22 Jul 2024 at 06:09, David Hildenbrand <david@redhat.com> wrote:
>>
>> On 20.07.24 19:35, Jerome Glisse wrote:
>>> Because maxnode bug there is no way to bind or migrate_pages to the
>>> last node in multi-node NUMA system unless you lie about maxnodes
>>> when making the mbind, set_mempolicy or migrate_pages syscall.
>>>
>>> Manpage for those syscall describe maxnodes as the number of bits in
>>> the node bitmap ("bit mask of nodes containing up to maxnode bits").
>>> Thus if maxnode is n then we expect to have a n bit(s) bitmap which
>>> means that the mask of valid bits is ((1 << n) - 1). The get_nodes()
>>> decrement lead to the mask being ((1 << (n - 1)) - 1).
>>>
>>> The three syscalls use a common helper get_nodes() and first things
>>> this helper do is decrement maxnode by 1 which leads to using n-1 bits
>>> in the provided mask of nodes (see get_bitmap() an helper function to
>>> get_nodes()).
>>>
>>> The lead to two bugs, either the last node in the bitmap provided will
>>> not be use in either of the three syscalls, or the syscalls will error
>>> out and return EINVAL if the only bit set in the bitmap was the last
>>> bit in the mask of nodes (which is ignored because of the bug and an
>>> empty mask of nodes is an invalid argument).
>>>
>>> I am surprised this bug was never caught ... it has been in the kernel
>>> since forever.
>>
>> Let's look at QEMU: backends/hostmem.c
>>
>>       /*
>>        * We can have up to MAX_NODES nodes, but we need to pass maxnode+1
>>        * as argument to mbind() due to an old Linux bug (feature?) which
>>        * cuts off the last specified node. This means backend->host_nodes
>>        * must have MAX_NODES+1 bits available.
>>        */
>>
>> Which means that it's been known for a long time, and the workaround
>> seems to be pretty easy.
>>
>> So I wonder if we rather want to update the documentation to match reality.
> 
> [Sorry resending as text ... gmail insanity]
> 
> I think it is kind of weird if we ask to supply maxnodes+1 to work
> around the bug. If we apply this patch qemu would continue to work as
> is while fixing users that were not aware of that bug. So I would say
> applying this patch does more good. Long term qemu can drop its
> workaround or keep it for backward compatibility with old kernel.

Not really, unfortunately. The thing is that it requires a lot more 
effort to sense support than simply pass maxnodes+1. So unless you know 
exactly on which minimum kernel version your software runs (barely 
happens), you will simply apply the workaround.

I would assume that each and every sane user out there does that 
already, judging that even that QEMU code is 10 years old (!).

In any case, we have to document that behavior that existed since the 
very beginning. Because it would be even *worse* if someone would 
develop against a new kernel and would get a bunch of bug reports when 
running on literally every old kernel out there :)

So my best guess is that long-term it will create more issues when we 
change the behavior ... but in any case we have to update the man pages.
David Hildenbrand July 23, 2024, 6:24 p.m. UTC | #6
On 23.07.24 19:37, David Hildenbrand wrote:
> On 23.07.24 18:33, Jerome Glisse wrote:
>> On Mon, 22 Jul 2024 at 06:09, David Hildenbrand <david@redhat.com> wrote:
>>>
>>> On 20.07.24 19:35, Jerome Glisse wrote:
>>>> Because maxnode bug there is no way to bind or migrate_pages to the
>>>> last node in multi-node NUMA system unless you lie about maxnodes
>>>> when making the mbind, set_mempolicy or migrate_pages syscall.
>>>>
>>>> Manpage for those syscall describe maxnodes as the number of bits in
>>>> the node bitmap ("bit mask of nodes containing up to maxnode bits").
>>>> Thus if maxnode is n then we expect to have a n bit(s) bitmap which
>>>> means that the mask of valid bits is ((1 << n) - 1). The get_nodes()
>>>> decrement lead to the mask being ((1 << (n - 1)) - 1).
>>>>
>>>> The three syscalls use a common helper get_nodes() and first things
>>>> this helper do is decrement maxnode by 1 which leads to using n-1 bits
>>>> in the provided mask of nodes (see get_bitmap() an helper function to
>>>> get_nodes()).
>>>>
>>>> The lead to two bugs, either the last node in the bitmap provided will
>>>> not be use in either of the three syscalls, or the syscalls will error
>>>> out and return EINVAL if the only bit set in the bitmap was the last
>>>> bit in the mask of nodes (which is ignored because of the bug and an
>>>> empty mask of nodes is an invalid argument).
>>>>
>>>> I am surprised this bug was never caught ... it has been in the kernel
>>>> since forever.
>>>
>>> Let's look at QEMU: backends/hostmem.c
>>>
>>>        /*
>>>         * We can have up to MAX_NODES nodes, but we need to pass maxnode+1
>>>         * as argument to mbind() due to an old Linux bug (feature?) which
>>>         * cuts off the last specified node. This means backend->host_nodes
>>>         * must have MAX_NODES+1 bits available.
>>>         */
>>>
>>> Which means that it's been known for a long time, and the workaround
>>> seems to be pretty easy.
>>>
>>> So I wonder if we rather want to update the documentation to match reality.
>>
>> [Sorry resending as text ... gmail insanity]
>>
>> I think it is kind of weird if we ask to supply maxnodes+1 to work
>> around the bug. If we apply this patch qemu would continue to work as
>> is while fixing users that were not aware of that bug. So I would say
>> applying this patch does more good. Long term qemu can drop its
>> workaround or keep it for backward compatibility with old kernel.
> 
> Not really, unfortunately. The thing is that it requires a lot more
> effort to sense support than simply pass maxnodes+1. So unless you know
> exactly on which minimum kernel version your software runs (barely
> happens), you will simply apply the workaround.
> 
> I would assume that each and every sane user out there does that
> already, judging that even that QEMU code is 10 years old (!).
> 
> In any case, we have to document that behavior that existed since the
> very beginning. Because it would be even *worse* if someone would
> develop against a new kernel and would get a bunch of bug reports when
> running on literally every old kernel out there :)
> 
> So my best guess is that long-term it will create more issues when we
> change the behavior ... but in any case we have to update the man pages.

I'll add a pointer to a discussion from 20 (!) years ago [1].

The conclusion [2] / request from Andi there was that N65 is the right 
thing to do, and we (re)added the --maxnode. This was when this code was 
introduced.

Assuming MAX_NODES is 128, we have to pass in 129 -- one more than the 
number of bits.
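
For completeness, the user-space workaround being discussed amounts to
something like the following (an illustrative sketch with made-up names,
not QEMU's actual code): reserve one spare bit and always pass one more
than the number of node bits as maxnode.

#include <stddef.h>
#include <numaif.h>

#define MY_MAX_NODES 128

/* One spare bit so that MY_MAX_NODES + 1 can be passed as maxnode and
 * the kernel's internal decrement still covers the last real node bit. */
static unsigned long host_nodes[(MY_MAX_NODES + 1 + 63) / 64];

static int bind_to_node(void *addr, size_t len, int node)
{
    host_nodes[node / 64] |= 1UL << (node % 64);
    return mbind(addr, len, MPOL_BIND, host_nodes, MY_MAX_NODES + 1, 0);
}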

So the man page might be actively misleading (unless I misinterpret it): 
"nodemask points to a bit mask of nodes containing up to maxnode bits. 
The bit mask size is rounded to the next multiple of sizeof(unsigned 
long), but the kernel will use bits only up to maxnode."

That documentation should be fixed.

[1] https://lkml.indiana.edu/hypermail/linux/kernel/0409.1/1538.html
[2] https://lkml.indiana.edu/hypermail/linux/kernel/0409.1/1569.html
Jerome Glisse July 24, 2024, 4:15 a.m. UTC | #7
On Tue, 23 Jul 2024 at 10:37, David Hildenbrand <david@redhat.com> wrote:
>
> On 23.07.24 18:33, Jerome Glisse wrote:
> > On Mon, 22 Jul 2024 at 06:09, David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 20.07.24 19:35, Jerome Glisse wrote:
> >>> Because maxnode bug there is no way to bind or migrate_pages to the
> >>> last node in multi-node NUMA system unless you lie about maxnodes
> >>> when making the mbind, set_mempolicy or migrate_pages syscall.
> >>>
> >>> Manpage for those syscall describe maxnodes as the number of bits in
> >>> the node bitmap ("bit mask of nodes containing up to maxnode bits").
> >>> Thus if maxnode is n then we expect to have a n bit(s) bitmap which
> >>> means that the mask of valid bits is ((1 << n) - 1). The get_nodes()
> >>> decrement lead to the mask being ((1 << (n - 1)) - 1).
> >>>
> >>> The three syscalls use a common helper get_nodes() and first things
> >>> this helper do is decrement maxnode by 1 which leads to using n-1 bits
> >>> in the provided mask of nodes (see get_bitmap() an helper function to
> >>> get_nodes()).
> >>>
> >>> The lead to two bugs, either the last node in the bitmap provided will
> >>> not be use in either of the three syscalls, or the syscalls will error
> >>> out and return EINVAL if the only bit set in the bitmap was the last
> >>> bit in the mask of nodes (which is ignored because of the bug and an
> >>> empty mask of nodes is an invalid argument).
> >>>
> >>> I am surprised this bug was never caught ... it has been in the kernel
> >>> since forever.
> >>
> >> Let's look at QEMU: backends/hostmem.c
> >>
> >>       /*
> >>        * We can have up to MAX_NODES nodes, but we need to pass maxnode+1
> >>        * as argument to mbind() due to an old Linux bug (feature?) which
> >>        * cuts off the last specified node. This means backend->host_nodes
> >>        * must have MAX_NODES+1 bits available.
> >>        */
> >>
> >> Which means that it's been known for a long time, and the workaround
> >> seems to be pretty easy.
> >>
> >> So I wonder if we rather want to update the documentation to match reality.
> >
> > [Sorry resending as text ... gmail insanity]
> >
> > I think it is kind of weird if we ask to supply maxnodes+1 to work
> > around the bug. If we apply this patch qemu would continue to work as
> > is while fixing users that were not aware of that bug. So I would say
> > applying this patch does more good. Long term qemu can drop its
> > workaround or keep it for backward compatibility with old kernel.
>
> Not really, unfortunately. The thing is that it requires a lot more
> effort to sense support than simply pass maxnodes+1. So unless you know
> exactly on which minimum kernel version your software runs (barely
> happens), you will simply apply the workaround.

The point I was trying to make is that working applications do not need
to change their code; a patched or unpatched kernel will not change their
behavior in any way and thus they will continue working regardless of
whether the kernel has the patch.

Meanwhile applications that are not as smart will keep misbehaving until
someone fixes the application. So to me it looks like the patch brings
good to people without harming any existing folks.

Fix in one place versus wait for people to fix their code ...


> I would assume that each and every sane user out there does that
> already, judging that even that QEMU code is 10 years old (!).

I took a look at some code I have access to and it is not the case
everywhere ...

>
> In any case, we have to document that behavior that existed since the
> very beginning. Because it would be even *worse* if someone would
> develop against a new kernel and would get a bunch of bug reports when
> running on literally every old kernel out there :)
>
> So my best guess is that long-term it will create more issues when we
> change the behavior ... but in any case we have to update the man pages.

No it would not, if you had the fix and did not modify applications
that are smart about it then nothing would change. Applications that
are smart will work the same on both patched and unpatched kernels
while applications that have the bug will suddenly have the behaviour
they would have expected from the documentation.

Thank you,
Jérôme
David Hildenbrand July 24, 2024, 6:27 a.m. UTC | #8
>> In any case, we have to document that behavior that existed since the
>> very beginning. Because it would be even *worse* if someone would
>> develop against a new kernel and would get a bunch of bug reports when
>> running on literally every old kernel out there :)
>>
>> So my best guess is that long-term it will create more issues when we
>> change the behavior ... but in any case we have to update the man pages.
> 
> No it would not, if you had the fix and did not modify applications
> that are smart about it then nothing would change. Applications that
> are smart will work the same on both patched and unpatched kernels
> while applications that have the bug will suddenly have the behaviour
> they would have expected from the documentation.

See my other mail, this seems to be the expected behavior since the very 
beginning.

So unless I am missing something important, the only thing to fix here 
is the documentation.

Patch

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index aec756ae5637..658e5366d266 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1434,7 +1434,6 @@  static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 		     unsigned long maxnode)
 {
-	--maxnode;
 	nodes_clear(*nodes);
 	if (maxnode == 0 || !nmask)
 		return 0;