diff mbox series

[v14,12/22] mm: pagewalk: Allow walking without vma

Message ID 20191028135910.33253-13-steven.price@arm.com (mailing list archive)
State New, archived
Headers show
Series Generic page walk and ptdump | expand

Commit Message

Steven Price Oct. 28, 2019, 1:59 p.m. UTC
Since commit 48684a65b4e3 ("mm: pagewalk: fix misbehavior of
walk_page_range for vma(VM_PFNMAP)"), the page table walker
(walk_page_range()) reports any kernel area as a hole, because it lacks
a vma.

This means each arch has re-implemented page table walking when needed,
for example in the per-arch ptdump walker.

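Remove the requirement to have a vma except when trying to split huge
pages. When there is no vma, huge (leaf) PMD/PUD entries are not split;
they are skipped instead of being descended into.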

Tested-by: Zong Li <zong.li@sifive.com>
Signed-off-by: Steven Price <steven.price@arm.com>
---
 mm/pagewalk.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

Comments

Steven Price Oct. 31, 2019, 5:22 p.m. UTC | #1
On Thu, Oct 31, 2019 at 03:15:10PM +0000, kernel test robot wrote:
> FYI, we noticed the following commit (built with gcc-7):
> 
> commit: 9343f6818bb98cf0c982bfff6ed89b2c7176bcf9 ("[PATCH v14 12/22] mm: pagewalk: Allow walking without vma")
> url: https://github.com/0day-ci/linux/commits/Steven-Price/Generic-page-walk-and-ptdump/20191030-085205
> 
[...]
> 
> [   36.010874] BUG: kernel NULL pointer dereference, address: 0000000000000053
> [   36.012644] #PF: supervisor read access in kernel mode
> [   36.014074] #PF: error_code(0x0000) - not-present page
> [   36.015481] PGD 0 P4D 0 
> [   36.016433] Oops: 0000 [#1] SMP PTI
> [   36.017561] CPU: 1 PID: 2376 Comm: mmap12 Not tainted 5.4.0-rc5-00046-g9343f6818bb98 #1
> [   36.019340] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
> [   36.021250] RIP: 0010:pagemap_pmd_range+0x5ae/0x7b0

So it looks like this has broken /proc/<pid>/pagemap because we can now
call the callbacks with a NULL vma if the region passed into
walk_page_range is (partially) outside the VMA range.

Somehow, in this situation, there is a region which has a PMD entry but
no corresponding VMA. So the pmd_entry callback is called but with
walk->vma==NULL.

The options for fixing this seem to be:
 a) Make the pagemap callback robust against a PMD entry without a VMA,
    for example by treating it as a hole (as it would have been before
    this patch):

---8<---
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9442631fd4af..b6d819c4bbb2 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1369,6 +1369,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 	pte_t *pte, *orig_pte;
 	int err = 0;
 
+	if (!vma)
+		return pagemap_pte_hole(addr, end, walk);
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	ptl = pmd_trans_huge_lock(pmdp, vma);
 	if (ptl) {
---8<---

 b) Provide a flag (or another function) for walk_page_range() which
    restores the previous behaviour. Only those users that want to walk
    ranges without VMAs would then need to handle callbacks being invoked
    with walk->vma == NULL.

---8<---
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 12004b097eae..519258e8fffa 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -61,6 +61,7 @@ struct mm_walk {
 	const struct mm_walk_ops *ops;
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
+	bool ignore_vma;
 	void *private;
 };

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 4139e9163aee..f2fccbc3cba8 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -38,7 +38,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 	do {
 again:
 		next = pmd_addr_end(addr, end);
-		if (pmd_none(*pmd)) {
+		if (pmd_none(*pmd) || (!walk->vma && !walk->ignore_vma)) {
 			if (ops->pte_hole)
 				err = ops->pte_hole(addr, next, walk);
 			if (err)
@@ -89,7 +89,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 	do {
  again:
 		next = pud_addr_end(addr, end);
-		if (pud_none(*pud)) {
+		if (pud_none(*pud) || (!walk->vma && !walk->ignore_vma)) {
 			if (ops->pte_hole)
 				err = ops->pte_hole(addr, next, walk);
 			if (err)
---8<---

I'm currently inclined towards the latter because I don't want to have
to try to audit all existing users in case there's anything similar
lurking in another user of walk_page_range().

Steve
diff mbox series

Patch

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index fc4d98a3a5a0..4139e9163aee 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -38,7 +38,7 @@  static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 	do {
 again:
 		next = pmd_addr_end(addr, end);
-		if (pmd_none(*pmd) || !walk->vma) {
+		if (pmd_none(*pmd)) {
 			if (ops->pte_hole)
 				err = ops->pte_hole(addr, next, walk);
 			if (err)
@@ -61,9 +61,14 @@  static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 		if (!ops->pte_entry)
 			continue;
 
-		split_huge_pmd(walk->vma, pmd, addr);
-		if (pmd_trans_unstable(pmd))
-			goto again;
+		if (walk->vma) {
+			split_huge_pmd(walk->vma, pmd, addr);
+			if (pmd_trans_unstable(pmd))
+				goto again;
+		} else if (pmd_leaf(*pmd)) {
+			continue;
+		}
+
 		err = walk_pte_range(pmd, addr, next, walk);
 		if (err)
 			break;
@@ -84,7 +89,7 @@  static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 	do {
  again:
 		next = pud_addr_end(addr, end);
-		if (pud_none(*pud) || !walk->vma) {
+		if (pud_none(*pud)) {
 			if (ops->pte_hole)
 				err = ops->pte_hole(addr, next, walk);
 			if (err)
@@ -98,9 +103,13 @@  static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 				break;
 		}
 
-		split_huge_pud(walk->vma, pud, addr);
-		if (pud_none(*pud))
-			goto again;
+		if (walk->vma) {
+			split_huge_pud(walk->vma, pud, addr);
+			if (pud_none(*pud))
+				goto again;
+		} else if (pud_leaf(*pud)) {
+			continue;
+		}
 
 		if (ops->pmd_entry || ops->pte_entry)
 			err = walk_pmd_range(pud, addr, next, walk);