diff mbox

[v2] mm, proc: Fix region lost in /proc/self/smaps

Message ID 1473649964-20191-1-git-send-email-guangrong.xiao@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Xiao Guangrong Sept. 12, 2016, 3:12 a.m. UTC
Recently, Redhat reported that nvml test suite failed on QEMU/KVM,
more detailed info please refer to:
   https://bugzilla.redhat.com/show_bug.cgi?id=1365721

Actually, this bug is not only for NVDIMM/DAX but also for any other file
systems. This simple test case abstracted from nvml can easily reproduce
this bug in common environment:

-------------------------- testcase.c -----------------------------
#define PROCMAXLEN 4096

int
is_pmem_proc(const void *addr, size_t len)
{
        const char *caddr = addr;

        FILE *fp;
        if ((fp = fopen("/proc/self/smaps", "r")) == NULL) {
                printf("!/proc/self/smaps");
                return 0;
        }

        int retval = 0;         /* assume false until proven otherwise */
        char line[PROCMAXLEN];  /* for fgets() */
        char *lo = NULL;        /* beginning of current range in smaps file */
        char *hi = NULL;        /* end of current range in smaps file */
        int needmm = 0;         /* looking for mm flag for current range */
        while (fgets(line, PROCMAXLEN, fp) != NULL) {
                static const char vmflags[] = "VmFlags:";
                static const char mm[] = " wr";

                /* check for range line */
                if (sscanf(line, "%p-%p", &lo, &hi) == 2) {
                        if (needmm) {
                                /* last range matched, but no mm flag found */
                                printf("never found mm flag.\n");
                                break;
                        } else if (caddr < lo) {
                                /* never found the range for caddr */
                                printf("#######no match for addr %p.\n", caddr);
                                break;
                        } else if (caddr < hi) {
                                /* start address is in this range */
                                size_t rangelen = (size_t)(hi - caddr);

                                /* remember that matching has started */
                                needmm = 1;

                                /* calculate remaining range to search for */
                                if (len > rangelen) {
                                        len -= rangelen;
                                        caddr += rangelen;
                                        printf("matched %zu bytes in range "
                                                "%p-%p, %zu left over.\n",
                                                        rangelen, lo, hi, len);
                                } else {
                                        len = 0;
                                        printf("matched all bytes in range "
                                                        "%p-%p.\n", lo, hi);
                                }
                        }
                } else if (needmm && strncmp(line, vmflags,
                                        sizeof(vmflags) - 1) == 0) {
                        if (strstr(&line[sizeof(vmflags) - 1], mm) != NULL) {
                                printf("mm flag found.\n");
                                if (len == 0) {
                                        /* entire range matched */
                                        retval = 1;
                                        break;
                                }
                                needmm = 0;     /* saw what was needed */
                        } else {
                                /* mm flag not set for some or all of range */
                                printf("range has no mm flag.\n");
                                break;
                        }
                }
        }

        fclose(fp);

        printf("returning %d.\n", retval);
        return retval;
}

#define NTHREAD 16

void *Addr;
size_t Size;

/*
 * worker -- the work each thread performs
 */
static void *
worker(void *arg)
{
        int *ret = (int *)arg;
        *ret =  is_pmem_proc(Addr, Size);
        return NULL;
}

int main(int argc, char *argv[])
{
        if (argc <  2 || argc > 3) {
                printf("usage: %s file [env].\n", argv[0]);
                return -1;
        }

        int fd = open(argv[1], O_RDWR);

        struct stat stbuf;
        fstat(fd, &stbuf);

        Size = stbuf.st_size;
        Addr = mmap(0, stbuf.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);

        close(fd);

        pthread_t threads[NTHREAD];
        int ret[NTHREAD];

        /* kick off NTHREAD threads */
        for (int i = 0; i < NTHREAD; i++)
                pthread_create(&threads[i], NULL, worker, &ret[i]);

        /* wait for all the threads to complete */
        for (int i = 0; i < NTHREAD; i++)
                pthread_join(threads[i], NULL);

        /* verify that all the threads return the same value */
        for (int i = 1; i < NTHREAD; i++) {
                if (ret[0] != ret[i]) {
                        printf("Error i %d ret[0] = %d ret[i] = %d.\n", i,
                                ret[0], ret[i]);
                }
        }

        printf("%d", ret[0]);
        return 0;
}

# dd if=/dev/zero of=~/out bs=2M count=1
# ./testcase ~/out

It failed as some threads can not find the memory region in
"/proc/self/smaps" which is allocated in the main process

It is caused by proc fs which uses 'file->version' to indicate the VMA that
is the last one has already been handled by read() system call. When the
next read() issues, it uses the 'version' to find the VMA, then the next
VMA is what we want to handle, the related code is as follows:

        if (last_addr) {
                vma = find_vma(mm, last_addr);
                if (vma && (vma = m_next_vma(priv, vma)))
                        return vma;
        }

However, VMA will be lost if the last VMA is gone, e.g:

The process VMA list is A->B->C->D

CPU 0                                  CPU 1
read() system call
   handle VMA B
   version = B
return to userspace

                                   unmap VMA B

issue read() again to continue to get
the region info
   find_vma(version) will get VMA C
   m_next_vma(C) will get VMA D
   handle D
   !!! VMA C is lost !!!

In order to fix this bug, we make 'file->version' indicate the end address
of current VMA

Changelog:
Thanks to Dave Hansen's comments, this version fixes the issue in v1 that
same virtual address range may be outputted twice, e.g:

Take two example VMAs:

	vma-A: (0x1000 -> 0x2000)
	vma-B: (0x2000 -> 0x3000)

read() #1: prints vma-A, sets m->version=0x2000

Now, merge A/B to make C:

	vma-C: (0x1000 -> 0x3000)

read() #2: find_vma(m->version=0x2000), returns vma-C, prints vma-C

The user will see two VMAs in their output:

	A: 0x1000->0x2000
	C: 0x1000->0x3000

Acked-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
---
 fs/proc/task_mmu.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

Comments

Michal Hocko Sept. 12, 2016, 12:54 p.m. UTC | #1
[Cc Oleg - he seemed to touch it the last. Keeping the whole email for
reference with a comment inline]

On Mon 12-09-16 11:12:44, Xiao Guangrong wrote:
> Recently, Redhat reported that nvml test suite failed on QEMU/KVM,
> more detailed info please refer to:
>    https://bugzilla.redhat.com/show_bug.cgi?id=1365721
> 
> Actually, this bug is not only for NVDIMM/DAX but also for any other file
> systems. This simple test case abstracted from nvml can easily reproduce
> this bug in common environment:
> 
> -------------------------- testcase.c -----------------------------
> #define PROCMAXLEN 4096
> 
> int
> is_pmem_proc(const void *addr, size_t len)
> {
>         const char *caddr = addr;
> 
>         FILE *fp;
>         if ((fp = fopen("/proc/self/smaps", "r")) == NULL) {
>                 printf("!/proc/self/smaps");
>                 return 0;
>         }
> 
>         int retval = 0;         /* assume false until proven otherwise */
>         char line[PROCMAXLEN];  /* for fgets() */
>         char *lo = NULL;        /* beginning of current range in smaps file */
>         char *hi = NULL;        /* end of current range in smaps file */
>         int needmm = 0;         /* looking for mm flag for current range */
>         while (fgets(line, PROCMAXLEN, fp) != NULL) {
>                 static const char vmflags[] = "VmFlags:";
>                 static const char mm[] = " wr";
> 
>                 /* check for range line */
>                 if (sscanf(line, "%p-%p", &lo, &hi) == 2) {
>                         if (needmm) {
>                                 /* last range matched, but no mm flag found */
>                                 printf("never found mm flag.\n");
>                                 break;
>                         } else if (caddr < lo) {
>                                 /* never found the range for caddr */
>                                 printf("#######no match for addr %p.\n", caddr);
>                                 break;
>                         } else if (caddr < hi) {
>                                 /* start address is in this range */
>                                 size_t rangelen = (size_t)(hi - caddr);
> 
>                                 /* remember that matching has started */
>                                 needmm = 1;
> 
>                                 /* calculate remaining range to search for */
>                                 if (len > rangelen) {
>                                         len -= rangelen;
>                                         caddr += rangelen;
>                                         printf("matched %zu bytes in range "
>                                                 "%p-%p, %zu left over.\n",
>                                                         rangelen, lo, hi, len);
>                                 } else {
>                                         len = 0;
>                                         printf("matched all bytes in range "
>                                                         "%p-%p.\n", lo, hi);
>                                 }
>                         }
>                 } else if (needmm && strncmp(line, vmflags,
>                                         sizeof(vmflags) - 1) == 0) {
>                         if (strstr(&line[sizeof(vmflags) - 1], mm) != NULL) {
>                                 printf("mm flag found.\n");
>                                 if (len == 0) {
>                                         /* entire range matched */
>                                         retval = 1;
>                                         break;
>                                 }
>                                 needmm = 0;     /* saw what was needed */
>                         } else {
>                                 /* mm flag not set for some or all of range */
>                                 printf("range has no mm flag.\n");
>                                 break;
>                         }
>                 }
>         }
> 
>         fclose(fp);
> 
>         printf("returning %d.\n", retval);
>         return retval;
> }
> 
> #define NTHREAD 16
> 
> void *Addr;
> size_t Size;
> 
> /*
>  * worker -- the work each thread performs
>  */
> static void *
> worker(void *arg)
> {
>         int *ret = (int *)arg;
>         *ret =  is_pmem_proc(Addr, Size);
>         return NULL;
> }
> 
> int main(int argc, char *argv[])
> {
>         if (argc <  2 || argc > 3) {
>                 printf("usage: %s file [env].\n", argv[0]);
>                 return -1;
>         }
> 
>         int fd = open(argv[1], O_RDWR);
> 
>         struct stat stbuf;
>         fstat(fd, &stbuf);
> 
>         Size = stbuf.st_size;
>         Addr = mmap(0, stbuf.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
> 
>         close(fd);
> 
>         pthread_t threads[NTHREAD];
>         int ret[NTHREAD];
> 
>         /* kick off NTHREAD threads */
>         for (int i = 0; i < NTHREAD; i++)
>                 pthread_create(&threads[i], NULL, worker, &ret[i]);
> 
>         /* wait for all the threads to complete */
>         for (int i = 0; i < NTHREAD; i++)
>                 pthread_join(threads[i], NULL);
> 
>         /* verify that all the threads return the same value */
>         for (int i = 1; i < NTHREAD; i++) {
>                 if (ret[0] != ret[i]) {
>                         printf("Error i %d ret[0] = %d ret[i] = %d.\n", i,
>                                 ret[0], ret[i]);
>                 }
>         }
> 
>         printf("%d", ret[0]);
>         return 0;
> }
> 
> # dd if=/dev/zero of=~/out bs=2M count=1
> # ./testcase ~/out
> 
> It failed as some threads can not find the memory region in
> "/proc/self/smaps" which is allocated in the main process
> 
> It is caused by proc fs which uses 'file->version' to indicate the VMA that
> is the last one has already been handled by read() system call. When the
> next read() issues, it uses the 'version' to find the VMA, then the next
> VMA is what we want to handle, the related code is as follows:
> 
>         if (last_addr) {
>                 vma = find_vma(mm, last_addr);
>                 if (vma && (vma = m_next_vma(priv, vma)))
>                         return vma;
>         }
> 
> However, VMA will be lost if the last VMA is gone, e.g:
> 
> The process VMA list is A->B->C->D
> 
> CPU 0                                  CPU 1
> read() system call
>    handle VMA B
>    version = B
> return to userspace
> 
>                                    unmap VMA B
> 
> issue read() again to continue to get
> the region info
>    find_vma(version) will get VMA C
>    m_next_vma(C) will get VMA D
>    handle D
>    !!! VMA C is lost !!!
> 
> In order to fix this bug, we make 'file->version' indicate the end address
> of current VMA

Doesn't this open doors to another weird cases. Say B would be partially
unmapped (tail of the VMA would get unmapped and reused for a new VMA.

I am not sure we provide any guarantee when there are more read
syscalls. Hmm, even with a single read() we can get inconsistent results
from different threads without any user space synchronization.

So in other words isn't this fixing a bug by introducing a slightly
different one while we are not really guaranteeing anything strong here?

> Changelog:
> Thanks to Dave Hansen's comments, this version fixes the issue in v1 that
> same virtual address range may be outputted twice, e.g:
> 
> Take two example VMAs:
> 
> 	vma-A: (0x1000 -> 0x2000)
> 	vma-B: (0x2000 -> 0x3000)
> 
> read() #1: prints vma-A, sets m->version=0x2000
> 
> Now, merge A/B to make C:
> 
> 	vma-C: (0x1000 -> 0x3000)
> 
> read() #2: find_vma(m->version=0x2000), returns vma-C, prints vma-C
> 
> The user will see two VMAs in their output:
> 
> 	A: 0x1000->0x2000
> 	C: 0x1000->0x3000
> 
> Acked-by: Dave Hansen <dave.hansen@intel.com>
> Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
> ---
>  fs/proc/task_mmu.c | 15 ++++++++++-----
>  1 file changed, 10 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 187d84e..10ca648 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -147,7 +147,7 @@ m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
>  static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
>  {
>  	if (m->count < m->size)	/* vma is copied successfully */
> -		m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
> +		m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;
>  }
>  
>  static void *m_start(struct seq_file *m, loff_t *ppos)
> @@ -176,14 +176,14 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
>  
>  	if (last_addr) {
>  		vma = find_vma(mm, last_addr);
> -		if (vma && (vma = m_next_vma(priv, vma)))
> +		if (vma)
>  			return vma;
>  	}
>  
>  	m->version = 0;
>  	if (pos < mm->map_count) {
>  		for (vma = mm->mmap; pos; pos--) {
> -			m->version = vma->vm_start;
> +			m->version = vma->vm_end;
>  			vma = vma->vm_next;
>  		}
>  		return vma;
> @@ -293,7 +293,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
>  	vm_flags_t flags = vma->vm_flags;
>  	unsigned long ino = 0;
>  	unsigned long long pgoff = 0;
> -	unsigned long start, end;
> +	unsigned long end, start = m->version;
>  	dev_t dev = 0;
>  	const char *name = NULL;
>  
> @@ -304,8 +304,13 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
>  		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
>  	}
>  
> +	/*
> +	 * the region [0, m->version) has already been handled, do not
> +	 * handle it doubly.
> +	 */
> +	start = max(vma->vm_start, start);
> +
>  	/* We don't show the stack guard page in /proc/maps */
> -	start = vma->vm_start;
>  	if (stack_guard_page_start(vma, start))
>  		start += PAGE_SIZE;
>  	end = vma->vm_end;
> -- 
> 1.8.3.1
>
Dave Hansen Sept. 12, 2016, 3:01 p.m. UTC | #2
On 09/12/2016 05:54 AM, Michal Hocko wrote:
>> > In order to fix this bug, we make 'file->version' indicate the end address
>> > of current VMA
> Doesn't this open doors to another weird cases. Say B would be partially
> unmapped (tail of the VMA would get unmapped and reused for a new VMA.

In the end, this interface isn't about VMAs.  It's about addresses, and
we need to make sure that the _addresses_ coming out of it are sane.  In
the case that a VMA was partially unmapped, it doesn't make sense to
show the "new" VMA because we already had some output covering the
address of the "new" VMA from the old one.

> I am not sure we provide any guarantee when there are more read
> syscalls. Hmm, even with a single read() we can get inconsistent results
> from different threads without any user space synchronization.

Yeah, very true.  But, I think we _can_ at least provide the following
guarantees (among others):
1. addresses don't go backwards
2. If there is something at a given vaddr during the entirety of the
   life of the smaps walk, we will produce some output for it.

> So in other words isn't this fixing a bug by introducing a slightly
> different one while we are not really guaranteeing anything strong here?

Well, the (original) bug here _is_ pretty crummy.  It's not printing a
VMA, and that VMA was never touched.  It's just collateral damage from
the previous guy who got destroyed.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michal Hocko Sept. 12, 2016, 7:10 p.m. UTC | #3
On Mon 12-09-16 08:01:06, Dave Hansen wrote:
> On 09/12/2016 05:54 AM, Michal Hocko wrote:
> >> > In order to fix this bug, we make 'file->version' indicate the end address
> >> > of current VMA
> > Doesn't this open doors to another weird cases. Say B would be partially
> > unmapped (tail of the VMA would get unmapped and reused for a new VMA.
> 
> In the end, this interface isn't about VMAs.  It's about addresses, and
> we need to make sure that the _addresses_ coming out of it are sane.  In
> the case that a VMA was partially unmapped, it doesn't make sense to
> show the "new" VMA because we already had some output covering the
> address of the "new" VMA from the old one.

OK, that is a fair point and it speaks for caching the vm_end rather
than vm_start+skip.

> > I am not sure we provide any guarantee when there are more read
> > syscalls. Hmm, even with a single read() we can get inconsistent results
> > from different threads without any user space synchronization.
> 
> Yeah, very true.  But, I think we _can_ at least provide the following
> guarantees (among others):
> 1. addresses don't go backwards
> 2. If there is something at a given vaddr during the entirety of the
>    life of the smaps walk, we will produce some output for it.

I guess we also want 
  3. no overlaps with previously printed values (assuming two subsequent
     reads without seek).

the patch tries to achieve the last part as well AFAICS but I guess this
is incomplete because at least /proc/<pid>/smaps will report counters
for the full vma range while the header (aka show_map_vma) will report
shorter (non-overlapping) range. I haven't checked other files which use
m_{start,next}

Considering how this all can be tricky and how partial reads can be
confusing and even misleading I am really wondering whether we
should simply document that only full reads will provide a sensible
results.
Xiao Guangrong Sept. 13, 2016, 3:01 a.m. UTC | #4
On 09/13/2016 03:10 AM, Michal Hocko wrote:
> On Mon 12-09-16 08:01:06, Dave Hansen wrote:
>> On 09/12/2016 05:54 AM, Michal Hocko wrote:
>>>>> In order to fix this bug, we make 'file->version' indicate the end address
>>>>> of current VMA
>>> Doesn't this open doors to another weird cases. Say B would be partially
>>> unmapped (tail of the VMA would get unmapped and reused for a new VMA.
>>
>> In the end, this interface isn't about VMAs.  It's about addresses, and
>> we need to make sure that the _addresses_ coming out of it are sane.  In
>> the case that a VMA was partially unmapped, it doesn't make sense to
>> show the "new" VMA because we already had some output covering the
>> address of the "new" VMA from the old one.
>
> OK, that is a fair point and it speaks for caching the vm_end rather
> than vm_start+skip.
>
>>> I am not sure we provide any guarantee when there are more read
>>> syscalls. Hmm, even with a single read() we can get inconsistent results
>>> from different threads without any user space synchronization.
>>
>> Yeah, very true.  But, I think we _can_ at least provide the following
>> guarantees (among others):
>> 1. addresses don't go backwards
>> 2. If there is something at a given vaddr during the entirety of the
>>    life of the smaps walk, we will produce some output for it.
>
> I guess we also want
>   3. no overlaps with previously printed values (assuming two subsequent
>      reads without seek).
>
> the patch tries to achieve the last part as well AFAICS but I guess this
> is incomplete because at least /proc/<pid>/smaps will report counters
> for the full vma range while the header (aka show_map_vma) will report
> shorter (non-overlapping) range. I haven't checked other files which use
> m_{start,next}

You are right. Will fix both /proc/PID/smaps and /proc/PID/maps in
the next version.

>
> Considering how this all can be tricky and how partial reads can be
> confusing and even misleading I am really wondering whether we
> should simply document that only full reads will provide a sensible
> results.

Make sense. Will document the guarantee in
Documentation/filesystems/proc.txt

Thank you, Dave and Michal, for figuring out the right direction. :)


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Oleg Nesterov Sept. 13, 2016, 2:59 p.m. UTC | #5
On 09/12, Michal Hocko wrote:
>
> Considering how this all can be tricky and how partial reads can be
> confusing and even misleading I am really wondering whether we
> should simply document that only full reads will provide a sensible
> results.

I agree. I don't even understand why this was considered as a bug.
Obviously, m_stop() which drops mmap_sep should not be called, or
all the threads should be stopped, if you want to trust the result.

Although all I can recall about this code is that it needs more cleanups.

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Hansen Sept. 13, 2016, 4:21 p.m. UTC | #6
On 09/13/2016 07:59 AM, Oleg Nesterov wrote:
> On 09/12, Michal Hocko wrote:
>> > Considering how this all can be tricky and how partial reads can be
>> > confusing and even misleading I am really wondering whether we
>> > should simply document that only full reads will provide a sensible
>> > results.
> I agree. I don't even understand why this was considered as a bug.
> Obviously, m_stop() which drops mmap_sep should not be called, or
> all the threads should be stopped, if you want to trust the result.

There was a mapping at a given address.  That mapping did not change, it
was not split, its attributes did not change.  But, it didn't show up
when reading smaps.  Folks _actually_ noticed this in a test suite
looking for that address range in smaps.

IOW, we had goofy kernel behavior, and it broke a reasonable test
program.  The test program just used fgets() to read into a fixed-length
buffer, which is a completely normal thing to do.

To get "sensible results", doesn't userspace have to somehow know in
advance how many bytes of data a given VMA will generate in smaps output?
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Oleg Nesterov Sept. 14, 2016, 3:38 p.m. UTC | #7
On 09/13, Dave Hansen wrote:
>
> On 09/13/2016 07:59 AM, Oleg Nesterov wrote:
> > I agree. I don't even understand why this was considered as a bug.
> > Obviously, m_stop() which drops mmap_sep should not be called, or
> > all the threads should be stopped, if you want to trust the result.
>
> There was a mapping at a given address.  That mapping did not change, it
> was not split, its attributes did not change.  But, it didn't show up
> when reading smaps.  Folks _actually_ noticed this in a test suite
> looking for that address range in smaps.

I understand, and I won't argue with any change which makes the things
better. Just I do not think this is a real problem. And this patch can't
fix other oddities and it seems it adds another one (at least) although
I can easily misread this patch and/or the code.

So we change m_cache_vma(),

	-        m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
	+        m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;

OK, and another change in m_start()

	-        if (vma && (vma = m_next_vma(priv, vma)))
	+        if (vma)

means that it can return the same vma if it grows in between.

show_map_vma() has another change

	+       start = max(vma->vm_start, start);

so it will be reported as _another_ vma, and this doesn't look exactly
right.

And after that *ppos will be falsely incremented... but probably this
doesn't matter because the "if (pos < mm->map_count)" logic in m_start()
looks broken anyway.

> IOW, we had goofy kernel behavior, and it broke a reasonable test
> program.  The test program just used fgets() to read into a fixed-length
> buffer, which is a completely normal thing to do.
>
> To get "sensible results", doesn't userspace have to somehow know in
> advance how many bytes of data a given VMA will generate in smaps output?

Yes, /proc/has its limitations ;)

Even if you read, say, /proc/pid/status you can get the corrupted result
after the short read. But in this case fgets() should likely work, yes.


Dave, let me repeat, I won't argue with any change and in any case you
can safely ignore my opinion.

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Sept. 19, 2016, 7:21 a.m. UTC | #8
On 09/14/2016 11:38 PM, Oleg Nesterov wrote:
> On 09/13, Dave Hansen wrote:
>>
>> On 09/13/2016 07:59 AM, Oleg Nesterov wrote:
>>> I agree. I don't even understand why this was considered as a bug.
>>> Obviously, m_stop() which drops mmap_sep should not be called, or
>>> all the threads should be stopped, if you want to trust the result.
>>
>> There was a mapping at a given address.  That mapping did not change, it
>> was not split, its attributes did not change.  But, it didn't show up
>> when reading smaps.  Folks _actually_ noticed this in a test suite
>> looking for that address range in smaps.
>
> I understand, and I won't argue with any change which makes the things
> better. Just I do not think this is a real problem. And this patch can't
> fix other oddities and it seems it adds another one (at least) although
> I can easily misread this patch and/or the code.
>
> So we change m_cache_vma(),
>
> 	-        m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
> 	+        m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;
>
> OK, and another change in m_start()
>
> 	-        if (vma && (vma = m_next_vma(priv, vma)))
> 	+        if (vma)
>
> means that it can return the same vma if it grows in between.
>
> show_map_vma() has another change
>
> 	+       start = max(vma->vm_start, start);
>
> so it will be reported as _another_ vma, and this doesn't look exactly
> right.

We noticed it in the discussion of v1, however it is not bad as Dave said
it is about 'address range' rather that vma.

>
> And after that *ppos will be falsely incremented... but probably this
> doesn't matter because the "if (pos < mm->map_count)" logic in m_start()
> looks broken anyway.

The 'broken' can happen only if it is not the first read and m->version is
zero (*ppos != 0 && m->version == 0). If i understand the code correctly,
only m->buffer overflowed can trigger this, for smaps, each vma only
uses ~1k memory that means this could not happen. Right?




--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 187d84e..10ca648 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -147,7 +147,7 @@  m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
 static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
 {
 	if (m->count < m->size)	/* vma is copied successfully */
-		m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
+		m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;
 }
 
 static void *m_start(struct seq_file *m, loff_t *ppos)
@@ -176,14 +176,14 @@  static void *m_start(struct seq_file *m, loff_t *ppos)
 
 	if (last_addr) {
 		vma = find_vma(mm, last_addr);
-		if (vma && (vma = m_next_vma(priv, vma)))
+		if (vma)
 			return vma;
 	}
 
 	m->version = 0;
 	if (pos < mm->map_count) {
 		for (vma = mm->mmap; pos; pos--) {
-			m->version = vma->vm_start;
+			m->version = vma->vm_end;
 			vma = vma->vm_next;
 		}
 		return vma;
@@ -293,7 +293,7 @@  show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 	vm_flags_t flags = vma->vm_flags;
 	unsigned long ino = 0;
 	unsigned long long pgoff = 0;
-	unsigned long start, end;
+	unsigned long end, start = m->version;
 	dev_t dev = 0;
 	const char *name = NULL;
 
@@ -304,8 +304,13 @@  show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
 	}
 
+	/*
+	 * the region [0, m->version) has already been handled, do not
+	 * handle it doubly.
+	 */
+	start = max(vma->vm_start, start);
+
 	/* We don't show the stack guard page in /proc/maps */
-	start = vma->vm_start;
 	if (stack_guard_page_start(vma, start))
 		start += PAGE_SIZE;
 	end = vma->vm_end;