diff mbox

Fix region lost in /proc/self/smaps

Message ID 1473231111-38058-1-git-send-email-guangrong.xiao@linux.intel.com
State New, archived
Headers show

Commit Message

Xiao Guangrong Sept. 7, 2016, 6:51 a.m. UTC
Recently, Redhat reported that nvml test suite failed on QEMU/KVM,
more detailed info please refer to:
   https://bugzilla.redhat.com/show_bug.cgi?id=1365721

Actually, this bug is not only for NVDIMM/DAX but also for any other file
systems. This simple test case abstracted from nvml can easily reproduce
this bug in common environment:

-------------------------- testcase.c -----------------------------
#define PROCMAXLEN 4096

int
is_pmem_proc(const void *addr, size_t len)
{
	const char *caddr = addr;

	FILE *fp;
	if ((fp = fopen("/proc/self/smaps", "r")) == NULL) {
		printf("!/proc/self/smaps");
		return 0;
	}

	int retval = 0;		/* assume false until proven otherwise */
	char line[PROCMAXLEN];	/* for fgets() */
	char *lo = NULL;	/* beginning of current range in smaps file */
	char *hi = NULL;	/* end of current range in smaps file */
	int needmm = 0;		/* looking for mm flag for current range */
	while (fgets(line, PROCMAXLEN, fp) != NULL) {
		static const char vmflags[] = "VmFlags:";
		static const char mm[] = " wr";

		/* check for range line */
		if (sscanf(line, "%p-%p", &lo, &hi) == 2) {
			if (needmm) {
				/* last range matched, but no mm flag found */
				printf("never found mm flag.\n");
				break;
			} else if (caddr < lo) {
				/* never found the range for caddr */
				printf("#######no match for addr %p.\n", caddr);
				break;
			} else if (caddr < hi) {
				/* start address is in this range */
				size_t rangelen = (size_t)(hi - caddr);

				/* remember that matching has started */
				needmm = 1;

				/* calculate remaining range to search for */
				if (len > rangelen) {
					len -= rangelen;
					caddr += rangelen;
					printf("matched %zu bytes in range "
						"%p-%p, %zu left over.\n",
							rangelen, lo, hi, len);
				} else {
					len = 0;
					printf("matched all bytes in range "
							"%p-%p.\n", lo, hi);
				}
			}
		} else if (needmm && strncmp(line, vmflags,
					sizeof(vmflags) - 1) == 0) {
			if (strstr(&line[sizeof(vmflags) - 1], mm) != NULL) {
				printf("mm flag found.\n");
				if (len == 0) {
					/* entire range matched */
					retval = 1;
					break;
				}
				needmm = 0;	/* saw what was needed */
			} else {
				/* mm flag not set for some or all of range */
				printf("range has no mm flag.\n");
				break;
			}
		}
	}

	fclose(fp);

	printf("returning %d.\n", retval);
	return retval;
}

#define NTHREAD 16

void *Addr;
size_t Size;

/*
 * worker -- the work each thread performs
 */
static void *
worker(void *arg)
{
	int *ret = (int *)arg;
	*ret =  is_pmem_proc(Addr, Size);
	return NULL;
}

int main(int argc, char *argv[])
{
	if (argc <  2 || argc > 3) {
		printf("usage: %s file [env].\n", argv[0]);
		return -1;
	}

	int fd = open(argv[1], O_RDWR);

	struct stat stbuf;
	fstat(fd, &stbuf);

	Size = stbuf.st_size;
	Addr = mmap(0, stbuf.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);

	close(fd);

	pthread_t threads[NTHREAD];
	int ret[NTHREAD];

	/* kick off NTHREAD threads */
	for (int i = 0; i < NTHREAD; i++)
		pthread_create(&threads[i], NULL, worker, &ret[i]);

	/* wait for all the threads to complete */
	for (int i = 0; i < NTHREAD; i++)
		pthread_join(threads[i], NULL);

	/* verify that all the threads return the same value */
	for (int i = 1; i < NTHREAD; i++) {
		if (ret[0] != ret[i]) {
			printf("Error i %d ret[0] = %d ret[i] = %d.\n", i,
				ret[0], ret[i]);
		}
	}

	printf("%d", ret[0]);
	return 0;
}

# dd if=/dev/zero of=~/out bs=2M count=1
# ./testcase ~/out

It failed as some threads can not find the memory region in
"/proc/self/smaps" which is allocated in the mail process

It is caused by proc fs which uses 'file->version' to indicate the VMA that
is the last one has already been handled by read() system call. When the
next read() issues, it uses the 'version' to find the VMA, then the next
VMA is what we want to handle, the related code is as follows:

        if (last_addr) {
                vma = find_vma(mm, last_addr);
                if (vma && (vma = m_next_vma(priv, vma)))
                        return vma;
        }

However, VMA will be lost if the last VMA is gone, eg:

The process VMA list is A->B->C->D

CPU 0                                  CPU 1
read() system call
   handle VMA B
   version = B
return to userspace

                                   unmap VMA B

issue read() again to continue to get
the region info
   find_vma(version) will get VMA C
   m_next_vma(C) will get VMA D
   handle D
   !!! VMA C is lost !!!

In order to fix this bug, we make 'file->version' indicate the next VMA
we want to handle

Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
---
 fs/proc/task_mmu.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

Comments

Xiao Guangrong Sept. 7, 2016, 7:05 a.m. UTC | #1
Sorry, the title should be [PATCH] mm, proc: Fix region lost in /proc/self/smaps

On 09/07/2016 02:51 PM, Xiao Guangrong wrote:
> Recently, Redhat reported that nvml test suite failed on QEMU/KVM,
> more detailed info please refer to:
>    https://bugzilla.redhat.com/show_bug.cgi?id=1365721
>
> Actually, this bug is not only for NVDIMM/DAX but also for any other file
> systems. This simple test case abstracted from nvml can easily reproduce
> this bug in common environment:
>
> -------------------------- testcase.c -----------------------------
> #define PROCMAXLEN 4096
>
> int
> is_pmem_proc(const void *addr, size_t len)
> {
> 	const char *caddr = addr;
>
> 	FILE *fp;
> 	if ((fp = fopen("/proc/self/smaps", "r")) == NULL) {
> 		printf("!/proc/self/smaps");
> 		return 0;
> 	}
>
> 	int retval = 0;		/* assume false until proven otherwise */
> 	char line[PROCMAXLEN];	/* for fgets() */
> 	char *lo = NULL;	/* beginning of current range in smaps file */
> 	char *hi = NULL;	/* end of current range in smaps file */
> 	int needmm = 0;		/* looking for mm flag for current range */
> 	while (fgets(line, PROCMAXLEN, fp) != NULL) {
> 		static const char vmflags[] = "VmFlags:";
> 		static const char mm[] = " wr";
>
> 		/* check for range line */
> 		if (sscanf(line, "%p-%p", &lo, &hi) == 2) {
> 			if (needmm) {
> 				/* last range matched, but no mm flag found */
> 				printf("never found mm flag.\n");
> 				break;
> 			} else if (caddr < lo) {
> 				/* never found the range for caddr */
> 				printf("#######no match for addr %p.\n", caddr);
> 				break;
> 			} else if (caddr < hi) {
> 				/* start address is in this range */
> 				size_t rangelen = (size_t)(hi - caddr);
>
> 				/* remember that matching has started */
> 				needmm = 1;
>
> 				/* calculate remaining range to search for */
> 				if (len > rangelen) {
> 					len -= rangelen;
> 					caddr += rangelen;
> 					printf("matched %zu bytes in range "
> 						"%p-%p, %zu left over.\n",
> 							rangelen, lo, hi, len);
> 				} else {
> 					len = 0;
> 					printf("matched all bytes in range "
> 							"%p-%p.\n", lo, hi);
> 				}
> 			}
> 		} else if (needmm && strncmp(line, vmflags,
> 					sizeof(vmflags) - 1) == 0) {
> 			if (strstr(&line[sizeof(vmflags) - 1], mm) != NULL) {
> 				printf("mm flag found.\n");
> 				if (len == 0) {
> 					/* entire range matched */
> 					retval = 1;
> 					break;
> 				}
> 				needmm = 0;	/* saw what was needed */
> 			} else {
> 				/* mm flag not set for some or all of range */
> 				printf("range has no mm flag.\n");
> 				break;
> 			}
> 		}
> 	}
>
> 	fclose(fp);
>
> 	printf("returning %d.\n", retval);
> 	return retval;
> }
>
> #define NTHREAD 16
>
> void *Addr;
> size_t Size;
>
> /*
>  * worker -- the work each thread performs
>  */
> static void *
> worker(void *arg)
> {
> 	int *ret = (int *)arg;
> 	*ret =  is_pmem_proc(Addr, Size);
> 	return NULL;
> }
>
> int main(int argc, char *argv[])
> {
> 	if (argc <  2 || argc > 3) {
> 		printf("usage: %s file [env].\n", argv[0]);
> 		return -1;
> 	}
>
> 	int fd = open(argv[1], O_RDWR);
>
> 	struct stat stbuf;
> 	fstat(fd, &stbuf);
>
> 	Size = stbuf.st_size;
> 	Addr = mmap(0, stbuf.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
>
> 	close(fd);
>
> 	pthread_t threads[NTHREAD];
> 	int ret[NTHREAD];
>
> 	/* kick off NTHREAD threads */
> 	for (int i = 0; i < NTHREAD; i++)
> 		pthread_create(&threads[i], NULL, worker, &ret[i]);
>
> 	/* wait for all the threads to complete */
> 	for (int i = 0; i < NTHREAD; i++)
> 		pthread_join(threads[i], NULL);
>
> 	/* verify that all the threads return the same value */
> 	for (int i = 1; i < NTHREAD; i++) {
> 		if (ret[0] != ret[i]) {
> 			printf("Error i %d ret[0] = %d ret[i] = %d.\n", i,
> 				ret[0], ret[i]);
> 		}
> 	}
>
> 	printf("%d", ret[0]);
> 	return 0;
> }
>
> # dd if=/dev/zero of=~/out bs=2M count=1
> # ./testcase ~/out
>
> It failed as some threads can not find the memory region in
> "/proc/self/smaps" which is allocated in the mail process
>
> It is caused by proc fs which uses 'file->version' to indicate the VMA that
> is the last one has already been handled by read() system call. When the
> next read() issues, it uses the 'version' to find the VMA, then the next
> VMA is what we want to handle, the related code is as follows:
>
>         if (last_addr) {
>                 vma = find_vma(mm, last_addr);
>                 if (vma && (vma = m_next_vma(priv, vma)))
>                         return vma;
>         }
>
> However, VMA will be lost if the last VMA is gone, eg:
>
> The process VMA list is A->B->C->D
>
> CPU 0                                  CPU 1
> read() system call
>    handle VMA B
>    version = B
> return to userspace
>
>                                    unmap VMA B
>
> issue read() again to continue to get
> the region info
>    find_vma(version) will get VMA C
>    m_next_vma(C) will get VMA D
>    handle D
>    !!! VMA C is lost !!!
>
> In order to fix this bug, we make 'file->version' indicate the next VMA
> we want to handle
>
> Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
> ---
>  fs/proc/task_mmu.c | 12 ++++++++----
>  1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 187d84e..ace4a69 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -146,8 +146,12 @@ m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
>
>  static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
>  {
> -	if (m->count < m->size)	/* vma is copied successfully */
> -		m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
> +	/* vma is copied successfully */
> +	if (m->count < m->size) {
> +		struct vm_area_struct *vma_next =  m_next_vma(m->private, vma);
> +
> +		m->version = vma_next ? vma_next->vm_start : -1UL;
> +	}
>  }
>
>  static void *m_start(struct seq_file *m, loff_t *ppos)
> @@ -176,15 +180,15 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
>
>  	if (last_addr) {
>  		vma = find_vma(mm, last_addr);
> -		if (vma && (vma = m_next_vma(priv, vma)))
> +		if (vma)
>  			return vma;
>  	}
>
>  	m->version = 0;
>  	if (pos < mm->map_count) {
>  		for (vma = mm->mmap; pos; pos--) {
> -			m->version = vma->vm_start;
>  			vma = vma->vm_next;
> +			m->version = vma->vm_start;
>  		}
>  		return vma;
>  	}
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Hansen Sept. 7, 2016, 4:34 p.m. UTC | #2
On 09/06/2016 11:51 PM, Xiao Guangrong wrote:
> In order to fix this bug, we make 'file->version' indicate the next VMA
> we want to handle

This new approach makes it more likely that we'll skip a new VMA that
gets inserted in between the read()s.  But, I guess that's OK.  We don't
exactly claim to be giving super up-to-date data at the time of read().

With the old code, was there also a case that we could print out the
same virtual address range more than once?  It seems like that could
happen if we had a VMA split between two reads.

I think this introduces one oddity: if you have a VMA merge between two
reads(), you might get the same virtual address range twice in your
output.  This didn't happen before because we would have just skipped
over the area that got merged.

Take two example VMAs:

	vma-A: (0x1000 -> 0x2000)
	vma-B: (0x2000 -> 0x3000)

read() #1: prints vma-A, sets m->version=0x2000

Now, merge A/B to make C:

	vma-C: (0x1000 -> 0x3000)

read() #2: find_vma(m->version=0x2000), returns vma-C, prints vma-C

The user will see two VMAs in their output:

	A: 0x1000->0x2000
	C: 0x1000->0x3000

Will it confuse them to see the same virtual address range twice?  Or is
there something preventing that happening that I'm missing?
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Sept. 8, 2016, 3:36 a.m. UTC | #3
On 09/08/2016 12:34 AM, Dave Hansen wrote:
> On 09/06/2016 11:51 PM, Xiao Guangrong wrote:
>> In order to fix this bug, we make 'file->version' indicate the next VMA
>> we want to handle
>
> This new approach makes it more likely that we'll skip a new VMA that
> gets inserted in between the read()s.  But, I guess that's OK.  We don't
> exactly claim to be giving super up-to-date data at the time of read().

Yes, I completely agree with you. :)

>
> With the old code, was there also a case that we could print out the
> same virtual address range more than once?  It seems like that could
> happen if we had a VMA split between two reads.

Yes.

>
> I think this introduces one oddity: if you have a VMA merge between two
> reads(), you might get the same virtual address range twice in your
> output.  This didn't happen before because we would have just skipped
> over the area that got merged.
>
> Take two example VMAs:
>
> 	vma-A: (0x1000 -> 0x2000)
> 	vma-B: (0x2000 -> 0x3000)
>
> read() #1: prints vma-A, sets m->version=0x2000
>
> Now, merge A/B to make C:
>
> 	vma-C: (0x1000 -> 0x3000)
>
> read() #2: find_vma(m->version=0x2000), returns vma-C, prints vma-C
>
> The user will see two VMAs in their output:
>
> 	A: 0x1000->0x2000
> 	C: 0x1000->0x3000
>
> Will it confuse them to see the same virtual address range twice?  Or is
> there something preventing that happening that I'm missing?
>

You are right. Nothing can prevent it.

However, it is not easy to handle the case that the new VMA overlays with the old VMA
already got by userspace. I think we have some choices:
1: One way is completely skipping the new VMA region as current kernel code does but i
    do not think this is good as the later VMAs will be dropped.

2: show the un-overlayed portion of new VMA. In your case, we just show the region
    (0x2000 -> 0x3000), however, it can not work well if the VMA is a new created
    region with different attributions.

3: completely show the new VMA as this patch does.

Which one do you prefer?

Thanks!

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Hansen Sept. 8, 2016, 2:05 p.m. UTC | #4
On 09/07/2016 08:36 PM, Xiao Guangrong wrote:>> The user will see two
VMAs in their output:
>>
>>     A: 0x1000->0x2000
>>     C: 0x1000->0x3000
>>
>> Will it confuse them to see the same virtual address range twice?  Or is
>> there something preventing that happening that I'm missing?
>>
> 
> You are right. Nothing can prevent it.
> 
> However, it is not easy to handle the case that the new VMA overlays
> with the old VMA
> already got by userspace. I think we have some choices:
> 1: One way is completely skipping the new VMA region as current kernel
> code does but i
>    do not think this is good as the later VMAs will be dropped.
> 
> 2: show the un-overlayed portion of new VMA. In your case, we just show
> the region
>    (0x2000 -> 0x3000), however, it can not work well if the VMA is a new
> created
>    region with different attributions.
> 
> 3: completely show the new VMA as this patch does.
> 
> Which one do you prefer?

I'd be willing to bet that #3 will break *somebody's* tooling.
Addresses going backwards is certainly screwy.  Imagine somebody using
smaps to search for address holes and doing hole_size=0x1000-0x2000.

#1 can lies about there being no mapping in place where there there may
have _always_ been a mapping and is very similar to the bug you were
originally fixing.  I think that throws it out.

#2 is our best bet, I think.  It's unfortunately also the most code.
It's also a bit of a fib because it'll show a mapping that never
actually existed, but I think this is OK.  I'm not sure what the
downside is that you're referring to, though.  Can you explain?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 187d84e..ace4a69 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -146,8 +146,12 @@  m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
 
 static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
 {
-	if (m->count < m->size)	/* vma is copied successfully */
-		m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
+	/* vma is copied successfully */
+	if (m->count < m->size) {
+		struct vm_area_struct *vma_next =  m_next_vma(m->private, vma);
+
+		m->version = vma_next ? vma_next->vm_start : -1UL;
+	}
 }
 
 static void *m_start(struct seq_file *m, loff_t *ppos)
@@ -176,15 +180,15 @@  static void *m_start(struct seq_file *m, loff_t *ppos)
 
 	if (last_addr) {
 		vma = find_vma(mm, last_addr);
-		if (vma && (vma = m_next_vma(priv, vma)))
+		if (vma)
 			return vma;
 	}
 
 	m->version = 0;
 	if (pos < mm->map_count) {
 		for (vma = mm->mmap; pos; pos--) {
-			m->version = vma->vm_start;
 			vma = vma->vm_next;
+			m->version = vma->vm_start;
 		}
 		return vma;
 	}