diff mbox series

[v3,1/3] s390/pci: Fix s390_mmio_read/write syscall page fault handling

Message ID 20240529-vfio_pci_mmap-v3-1-cd217d019218@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series vfio/pci: s390: Fix issues preventing VFIO_PCI_MMAP=y for s390 and enable it | expand

Commit Message

Niklas Schnelle May 29, 2024, 11:36 a.m. UTC
The s390 MMIO syscalls when using the classic PCI instructions do not
cause a page fault when follow_pte() fails due to the page not being
present. Besides being a general deficiency this breaks vfio-pci's mmap()
handling once VFIO_PCI_MMAP gets enabled as this lazily maps on first
access. Fix this by following a failed follow_pte() with
fixup_user_page() and retrying the follow_pte().

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
---
 arch/s390/pci/pci_mmio.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

Comments

Niklas Schnelle June 11, 2024, 11:21 a.m. UTC | #1
On Wed, 2024-05-29 at 13:36 +0200, Niklas Schnelle wrote:
> The s390 MMIO syscalls when using the classic PCI instructions do not
> cause a page fault when follow_pte() fails due to the page not being
> present. Besides being a general deficiency this breaks vfio-pci's mmap()
> handling once VFIO_PCI_MMAP gets enabled as this lazily maps on first
> access. Fix this by following a failed follow_pte() with
> fixup_user_page() and retrying the follow_pte().
> 
> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
> Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
> ---
>  arch/s390/pci/pci_mmio.c | 18 +++++++++++++-----
>  1 file changed, 13 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
> index 5398729bfe1b..80c21b1a101c 100644
> --- a/arch/s390/pci/pci_mmio.c
> +++ b/arch/s390/pci/pci_mmio.c
> @@ -170,8 +170,12 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
>  		goto out_unlock_mmap;
>  
>  	ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
> -	if (ret)
> -		goto out_unlock_mmap;
> +	if (ret) {
> +		fixup_user_fault(current->mm, mmio_addr, FAULT_FLAG_WRITE, NULL);
> +		ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
> +		if (ret)
> +			goto out_unlock_mmap;
> +	}
>  
>  	io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
>  			(mmio_addr & ~PAGE_MASK));
> @@ -305,12 +309,16 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
>  	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
>  		goto out_unlock_mmap;
>  	ret = -EACCES;
> -	if (!(vma->vm_flags & VM_WRITE))
> +	if (!(vma->vm_flags & VM_READ))
>  		goto out_unlock_mmap;
>  
>  	ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
> -	if (ret)
> -		goto out_unlock_mmap;
> +	if (ret) {
> +		fixup_user_fault(current->mm, mmio_addr, 0, NULL);
> +		ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
> +		if (ret)
> +			goto out_unlock_mmap;
> +	}
>  
>  	io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
>  			(mmio_addr & ~PAGE_MASK));
> 

Ughh, I think I just stumbled over a problem with this. This is a
failing lock held assertion via __is_vma_write_locked() in
remap_pfn_range_notrack() but I'm not sure yet what exactly causes this

[   67.338855] ------------[ cut here ]------------
[   67.338865] WARNING: CPU: 15 PID: 2056 at include/linux/rwsem.h:85 remap_pfn_range_notrack+0x596/0x5b0
[   67.338874] Modules linked in: <--- 8< --->
[   67.338931] CPU: 15 PID: 2056 Comm: vfio-test Not tainted 6.10.0-rc1-pci-pfault-00004-g193e3a513cee #5
[   67.338934] Hardware name: IBM 3931 A01 701 (LPAR)
[   67.338935] Krnl PSW : 0704c00180000000 000003e54c9730ea (remap_pfn_range_notrack+0x59a/0x5b0)
[   67.338940]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
[   67.338944] Krnl GPRS: 0000000000000100 000003655915fb78 000002d80b9a5928 000003ff7fa00000
[   67.338946]            0004008000000000 0000000000004000 0000000000000711 000003ff7fa04000
[   67.338948]            000002d80c533f00 000002d800000100 000002d81bbe6c28 000002d80b9a5928
[   67.338950]            000003ff7fa00000 000002d80c533f00 000003e54c973120 000003655915fab0
[   67.338956] Krnl Code: 000003e54c9730de: a708ffea            lhi     %r0,-22
                          000003e54c9730e2: a7f4fff6            brc     15,000003e54c9730ce
                         #000003e54c9730e6: af000000            mc      0,0
                         >000003e54c9730ea: a7f4fd6e            brc     15,000003e54c972bc6
                          000003e54c9730ee: af000000            mc      0,0
                          000003e54c9730f2: af000000            mc      0,0
                          000003e54c9730f6: 0707                bcr     0,%r7
                          000003e54c9730f8: 0707                bcr     0,%r7
[   67.339025] Call Trace:
[   67.339027]  [<000003e54c9730ea>] remap_pfn_range_notrack+0x59a/0x5b0
[   67.339032]  [<000003e54c973120>] remap_pfn_range+0x20/0x30
[   67.339035]  [<000003e4cce5396c>] vfio_pci_mmap_fault+0xec/0x1d0 [vfio_pci_core]
[   67.339043]  [<000003e54c977240>] handle_mm_fault+0x6b0/0x25a0
[   67.339046]  [<000003e54c966328>] fixup_user_fault+0x138/0x310
[   67.339048]  [<000003e54c63a91c>] __s390x_sys_s390_pci_mmio_read+0x28c/0x3a0
[   67.339051]  [<000003e54c5e200a>] do_syscall+0xea/0x120
[   67.339055]  [<000003e54d5f9954>] __do_syscall+0x94/0x140
[   67.339059]  [<000003e54d611020>] system_call+0x70/0xa0
[   67.339063] Last Breaking-Event-Address:
[   67.339065]  [<000003e54c972bc2>] remap_pfn_range_notrack+0x72/0x5b0
[   67.339067] ---[ end trace 0000000000000000 ]---
Niklas Schnelle June 11, 2024, 12:08 p.m. UTC | #2
On Tue, 2024-06-11 at 13:21 +0200, Niklas Schnelle wrote:
> On Wed, 2024-05-29 at 13:36 +0200, Niklas Schnelle wrote:
> > The s390 MMIO syscalls when using the classic PCI instructions do not
> > cause a page fault when follow_pte() fails due to the page not being
> > present. Besides being a general deficiency this breaks vfio-pci's mmap()
> > handling once VFIO_PCI_MMAP gets enabled as this lazily maps on first
> > access. Fix this by following a failed follow_pte() with
> > fixup_user_page() and retrying the follow_pte().
> > 
> > Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> > Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
> > Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
> > ---
> >  arch/s390/pci/pci_mmio.c | 18 +++++++++++++-----
> >  1 file changed, 13 insertions(+), 5 deletions(-)
> > 
> > diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
> > index 5398729bfe1b..80c21b1a101c 100644
> > --- a/arch/s390/pci/pci_mmio.c
> > +++ b/arch/s390/pci/pci_mmio.c
> > @@ -170,8 +170,12 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
> >  		goto out_unlock_mmap;
> >  
> >  	ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
> > -	if (ret)
> > -		goto out_unlock_mmap;
> > +	if (ret) {
> > +		fixup_user_fault(current->mm, mmio_addr, FAULT_FLAG_WRITE, NULL);
> > +		ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
> > +		if (ret)
> > +			goto out_unlock_mmap;
> > +	}
> >  
> >  	io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
> >  			(mmio_addr & ~PAGE_MASK));
> > @@ -305,12 +309,16 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
> >  	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
> >  		goto out_unlock_mmap;
> >  	ret = -EACCES;
> > -	if (!(vma->vm_flags & VM_WRITE))
> > +	if (!(vma->vm_flags & VM_READ))
> >  		goto out_unlock_mmap;
> >  
> >  	ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
> > -	if (ret)
> > -		goto out_unlock_mmap;
> > +	if (ret) {
> > +		fixup_user_fault(current->mm, mmio_addr, 0, NULL);
> > +		ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
> > +		if (ret)
> > +			goto out_unlock_mmap;
> > +	}
> >  
> >  	io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
> >  			(mmio_addr & ~PAGE_MASK));
> > 
> 
> Ughh, I think I just stumbled over a problem with this. This is a
> failing lock held assertion via __is_vma_write_locked() in
> remap_pfn_range_notrack() but I'm not sure yet what exactly causes this
> 
> [   67.338855] ------------[ cut here ]------------
> [   67.338865] WARNING: CPU: 15 PID: 2056 at include/linux/rwsem.h:85 remap_pfn_range_notrack+0x596/0x5b0
> [   67.338874] Modules linked in: <--- 8< --->
> [   67.338931] CPU: 15 PID: 2056 Comm: vfio-test Not tainted 6.10.0-rc1-pci-pfault-00004-g193e3a513cee #5
> [   67.338934] Hardware name: IBM 3931 A01 701 (LPAR)
> [   67.338935] Krnl PSW : 0704c00180000000 000003e54c9730ea (remap_pfn_range_notrack+0x59a/0x5b0)
> [   67.338940]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> [   67.338944] Krnl GPRS: 0000000000000100 000003655915fb78 000002d80b9a5928 000003ff7fa00000
> [   67.338946]            0004008000000000 0000000000004000 0000000000000711 000003ff7fa04000
> [   67.338948]            000002d80c533f00 000002d800000100 000002d81bbe6c28 000002d80b9a5928
> [   67.338950]            000003ff7fa00000 000002d80c533f00 000003e54c973120 000003655915fab0
> [   67.338956] Krnl Code: 000003e54c9730de: a708ffea            lhi     %r0,-22
>                           000003e54c9730e2: a7f4fff6            brc     15,000003e54c9730ce
>                          #000003e54c9730e6: af000000            mc      0,0
>                          >000003e54c9730ea: a7f4fd6e            brc     15,000003e54c972bc6
>                           000003e54c9730ee: af000000            mc      0,0
>                           000003e54c9730f2: af000000            mc      0,0
>                           000003e54c9730f6: 0707                bcr     0,%r7
>                           000003e54c9730f8: 0707                bcr     0,%r7
> [   67.339025] Call Trace:
> [   67.339027]  [<000003e54c9730ea>] remap_pfn_range_notrack+0x59a/0x5b0
> [   67.339032]  [<000003e54c973120>] remap_pfn_range+0x20/0x30
> [   67.339035]  [<000003e4cce5396c>] vfio_pci_mmap_fault+0xec/0x1d0 [vfio_pci_core]
> [   67.339043]  [<000003e54c977240>] handle_mm_fault+0x6b0/0x25a0
> [   67.339046]  [<000003e54c966328>] fixup_user_fault+0x138/0x310
> [   67.339048]  [<000003e54c63a91c>] __s390x_sys_s390_pci_mmio_read+0x28c/0x3a0
> [   67.339051]  [<000003e54c5e200a>] do_syscall+0xea/0x120
> [   67.339055]  [<000003e54d5f9954>] __do_syscall+0x94/0x140
> [   67.339059]  [<000003e54d611020>] system_call+0x70/0xa0
> [   67.339063] Last Breaking-Event-Address:
> [   67.339065]  [<000003e54c972bc2>] remap_pfn_range_notrack+0x72/0x5b0
> [   67.339067] ---[ end trace 0000000000000000 ]---
> 

This has me a bit confused so far as __is_vma_write_locked() checks
mmap_assert_write_locked(vma->vm_mm) but most other users of
fixup_user_fault() hold mmap_read_lock() just like this code and
clearly in the non page fault case we only need the read lock.
Niklas Schnelle June 11, 2024, 1:23 p.m. UTC | #3
On Tue, 2024-06-11 at 14:08 +0200, Niklas Schnelle wrote:
> On Tue, 2024-06-11 at 13:21 +0200, Niklas Schnelle wrote:
> > On Wed, 2024-05-29 at 13:36 +0200, Niklas Schnelle wrote:
> > > The s390 MMIO syscalls when using the classic PCI instructions do not
> > > cause a page fault when follow_pte() fails due to the page not being
> > > present. Besides being a general deficiency this breaks vfio-pci's mmap()
> > > handling once VFIO_PCI_MMAP gets enabled as this lazily maps on first
> > > access. Fix this by following a failed follow_pte() with
> > > fixup_user_page() and retrying the follow_pte().
> > > 
> > > Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
> > > Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
> > > Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
> > > ---
> > >  arch/s390/pci/pci_mmio.c | 18 +++++++++++++-----
> > >  1 file changed, 13 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
> > > index 5398729bfe1b..80c21b1a101c 100644
> > > --- a/arch/s390/pci/pci_mmio.c
> > > +++ b/arch/s390/pci/pci_mmio.c
> > > @@ -170,8 +170,12 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
> > >  		goto out_unlock_mmap;
> > >  
> > >  	ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
> > > -	if (ret)
> > > -		goto out_unlock_mmap;
> > > +	if (ret) {
> > > +		fixup_user_fault(current->mm, mmio_addr, FAULT_FLAG_WRITE, NULL);
> > > +		ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
> > > +		if (ret)
> > > +			goto out_unlock_mmap;
> > > +	}
> > >  
> > >  	io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
> > >  			(mmio_addr & ~PAGE_MASK));
> > > @@ -305,12 +309,16 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
> > >  	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
> > >  		goto out_unlock_mmap;
> > >  	ret = -EACCES;
> > > -	if (!(vma->vm_flags & VM_WRITE))
> > > +	if (!(vma->vm_flags & VM_READ))
> > >  		goto out_unlock_mmap;
> > >  
> > >  	ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
> > > -	if (ret)
> > > -		goto out_unlock_mmap;
> > > +	if (ret) {
> > > +		fixup_user_fault(current->mm, mmio_addr, 0, NULL);
> > > +		ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
> > > +		if (ret)
> > > +			goto out_unlock_mmap;
> > > +	}
> > >  
> > >  	io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
> > >  			(mmio_addr & ~PAGE_MASK));
> > > 
> > 
> > Ughh, I think I just stumbled over a problem with this. This is a
> > failing lock held assertion via __is_vma_write_locked() in
> > remap_pfn_range_notrack() but I'm not sure yet what exactly causes this
> > 
> > [   67.338855] ------------[ cut here ]------------
> > [   67.338865] WARNING: CPU: 15 PID: 2056 at include/linux/rwsem.h:85 remap_pfn_range_notrack+0x596/0x5b0
> > [   67.338874] Modules linked in: <--- 8< --->
> > [   67.338931] CPU: 15 PID: 2056 Comm: vfio-test Not tainted 6.10.0-rc1-pci-pfault-00004-g193e3a513cee #5
> > [   67.338934] Hardware name: IBM 3931 A01 701 (LPAR)
> > [   67.338935] Krnl PSW : 0704c00180000000 000003e54c9730ea (remap_pfn_range_notrack+0x59a/0x5b0)
> > [   67.338940]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> > [   67.338944] Krnl GPRS: 0000000000000100 000003655915fb78 000002d80b9a5928 000003ff7fa00000
> > [   67.338946]            0004008000000000 0000000000004000 0000000000000711 000003ff7fa04000
> > [   67.338948]            000002d80c533f00 000002d800000100 000002d81bbe6c28 000002d80b9a5928
> > [   67.338950]            000003ff7fa00000 000002d80c533f00 000003e54c973120 000003655915fab0
> > [   67.338956] Krnl Code: 000003e54c9730de: a708ffea            lhi     %r0,-22
> >                           000003e54c9730e2: a7f4fff6            brc     15,000003e54c9730ce
> >                          #000003e54c9730e6: af000000            mc      0,0
> >                          >000003e54c9730ea: a7f4fd6e            brc     15,000003e54c972bc6
> >                           000003e54c9730ee: af000000            mc      0,0
> >                           000003e54c9730f2: af000000            mc      0,0
> >                           000003e54c9730f6: 0707                bcr     0,%r7
> >                           000003e54c9730f8: 0707                bcr     0,%r7
> > [   67.339025] Call Trace:
> > [   67.339027]  [<000003e54c9730ea>] remap_pfn_range_notrack+0x59a/0x5b0
> > [   67.339032]  [<000003e54c973120>] remap_pfn_range+0x20/0x30
> > [   67.339035]  [<000003e4cce5396c>] vfio_pci_mmap_fault+0xec/0x1d0 [vfio_pci_core]
> > [   67.339043]  [<000003e54c977240>] handle_mm_fault+0x6b0/0x25a0
> > [   67.339046]  [<000003e54c966328>] fixup_user_fault+0x138/0x310
> > [   67.339048]  [<000003e54c63a91c>] __s390x_sys_s390_pci_mmio_read+0x28c/0x3a0
> > [   67.339051]  [<000003e54c5e200a>] do_syscall+0xea/0x120
> > [   67.339055]  [<000003e54d5f9954>] __do_syscall+0x94/0x140
> > [   67.339059]  [<000003e54d611020>] system_call+0x70/0xa0
> > [   67.339063] Last Breaking-Event-Address:
> > [   67.339065]  [<000003e54c972bc2>] remap_pfn_range_notrack+0x72/0x5b0
> > [   67.339067] ---[ end trace 0000000000000000 ]---
> > 
> 
> This has me a bit confused so far as __is_vma_write_locked() checks
> mmap_assert_write_locked(vma->vm_mm) but most other users of
> fixup_user_fault() hold mmap_read_lock() just like this code and
> clearly in the non page fault case we only need the read lock.
> 

And it gets weirder, as I could have sworn that I properly tested this
on v1, I retested with v1 (tags/sent/vfio_pci_mmap-v1 on my
git.kernel.org/niks and based on v6.9) and there I don't get the above
warning. I also made sure that it's not caused by my change to
"current->mm" for v2. But I'm also not hitting the checks David moved
into follow_pte() so yeah not sure what's going on here.
David Hildenbrand June 11, 2024, 2:13 p.m. UTC | #4
On 11.06.24 15:23, Niklas Schnelle wrote:
> On Tue, 2024-06-11 at 14:08 +0200, Niklas Schnelle wrote:
>> On Tue, 2024-06-11 at 13:21 +0200, Niklas Schnelle wrote:
>>> On Wed, 2024-05-29 at 13:36 +0200, Niklas Schnelle wrote:
>>>> The s390 MMIO syscalls when using the classic PCI instructions do not
>>>> cause a page fault when follow_pte() fails due to the page not being
>>>> present. Besides being a general deficiency this breaks vfio-pci's mmap()
>>>> handling once VFIO_PCI_MMAP gets enabled as this lazily maps on first
>>>> access. Fix this by following a failed follow_pte() with
>>>> fixup_user_page() and retrying the follow_pte().
>>>>
>>>> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
>>>> Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
>>>> Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
>>>> ---
>>>>   arch/s390/pci/pci_mmio.c | 18 +++++++++++++-----
>>>>   1 file changed, 13 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
>>>> index 5398729bfe1b..80c21b1a101c 100644
>>>> --- a/arch/s390/pci/pci_mmio.c
>>>> +++ b/arch/s390/pci/pci_mmio.c
>>>> @@ -170,8 +170,12 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
>>>>   		goto out_unlock_mmap;
>>>>   
>>>>   	ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
>>>> -	if (ret)
>>>> -		goto out_unlock_mmap;
>>>> +	if (ret) {
>>>> +		fixup_user_fault(current->mm, mmio_addr, FAULT_FLAG_WRITE, NULL);
>>>> +		ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
>>>> +		if (ret)
>>>> +			goto out_unlock_mmap;
>>>> +	}
>>>>   
>>>>   	io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
>>>>   			(mmio_addr & ~PAGE_MASK));
>>>> @@ -305,12 +309,16 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
>>>>   	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
>>>>   		goto out_unlock_mmap;
>>>>   	ret = -EACCES;
>>>> -	if (!(vma->vm_flags & VM_WRITE))
>>>> +	if (!(vma->vm_flags & VM_READ))
>>>>   		goto out_unlock_mmap;
>>>>   
>>>>   	ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
>>>> -	if (ret)
>>>> -		goto out_unlock_mmap;
>>>> +	if (ret) {
>>>> +		fixup_user_fault(current->mm, mmio_addr, 0, NULL);
>>>> +		ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
>>>> +		if (ret)
>>>> +			goto out_unlock_mmap;
>>>> +	}
>>>>   
>>>>   	io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
>>>>   			(mmio_addr & ~PAGE_MASK));
>>>>
>>>
>>> Ughh, I think I just stumbled over a problem with this. This is a
>>> failing lock held assertion via __is_vma_write_locked() in
>>> remap_pfn_range_notrack() but I'm not sure yet what exactly causes this
>>>
>>> [   67.338855] ------------[ cut here ]------------
>>> [   67.338865] WARNING: CPU: 15 PID: 2056 at include/linux/rwsem.h:85 remap_pfn_range_notrack+0x596/0x5b0
>>> [   67.338874] Modules linked in: <--- 8< --->
>>> [   67.338931] CPU: 15 PID: 2056 Comm: vfio-test Not tainted 6.10.0-rc1-pci-pfault-00004-g193e3a513cee #5
>>> [   67.338934] Hardware name: IBM 3931 A01 701 (LPAR)
>>> [   67.338935] Krnl PSW : 0704c00180000000 000003e54c9730ea (remap_pfn_range_notrack+0x59a/0x5b0)
>>> [   67.338940]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
>>> [   67.338944] Krnl GPRS: 0000000000000100 000003655915fb78 000002d80b9a5928 000003ff7fa00000
>>> [   67.338946]            0004008000000000 0000000000004000 0000000000000711 000003ff7fa04000
>>> [   67.338948]            000002d80c533f00 000002d800000100 000002d81bbe6c28 000002d80b9a5928
>>> [   67.338950]            000003ff7fa00000 000002d80c533f00 000003e54c973120 000003655915fab0
>>> [   67.338956] Krnl Code: 000003e54c9730de: a708ffea            lhi     %r0,-22
>>>                            000003e54c9730e2: a7f4fff6            brc     15,000003e54c9730ce
>>>                           #000003e54c9730e6: af000000            mc      0,0
>>>                           >000003e54c9730ea: a7f4fd6e            brc     15,000003e54c972bc6
>>>                            000003e54c9730ee: af000000            mc      0,0
>>>                            000003e54c9730f2: af000000            mc      0,0
>>>                            000003e54c9730f6: 0707                bcr     0,%r7
>>>                            000003e54c9730f8: 0707                bcr     0,%r7
>>> [   67.339025] Call Trace:
>>> [   67.339027]  [<000003e54c9730ea>] remap_pfn_range_notrack+0x59a/0x5b0
>>> [   67.339032]  [<000003e54c973120>] remap_pfn_range+0x20/0x30
>>> [   67.339035]  [<000003e4cce5396c>] vfio_pci_mmap_fault+0xec/0x1d0 [vfio_pci_core]
>>> [   67.339043]  [<000003e54c977240>] handle_mm_fault+0x6b0/0x25a0
>>> [   67.339046]  [<000003e54c966328>] fixup_user_fault+0x138/0x310
>>> [   67.339048]  [<000003e54c63a91c>] __s390x_sys_s390_pci_mmio_read+0x28c/0x3a0
>>> [   67.339051]  [<000003e54c5e200a>] do_syscall+0xea/0x120
>>> [   67.339055]  [<000003e54d5f9954>] __do_syscall+0x94/0x140
>>> [   67.339059]  [<000003e54d611020>] system_call+0x70/0xa0
>>> [   67.339063] Last Breaking-Event-Address:
>>> [   67.339065]  [<000003e54c972bc2>] remap_pfn_range_notrack+0x72/0x5b0
>>> [   67.339067] ---[ end trace 0000000000000000 ]---
>>>
>>
>> This has me a bit confused so far as __is_vma_write_locked() checks
>> mmap_assert_write_locked(vma->vm_mm) but most other users of
>> fixup_user_fault() hold mmap_read_lock() just like this code and
>> clearly in the non page fault case we only need the read lock.

This is likely the 
vm_flags_set()->vma_start_write(vma)->__is_vma_write_locked()

which checks mmap_assert_write_locked().

Setting VMA flags would be racy with the mmap lock in read mode.


remap_pfn_range() documents: "this is only safe if the mm semaphore is 
held when called." which doesn't spell out if it needs to be held in 
write mode (which I think it does) :)


My best guess is: if you are using remap_pfn_range() from a fault 
handler (not during mmap time) you are doing something wrong, that's why 
you get that report.

vmf_insert_pfn() and friends might be better alternatives, that make 
sure that the VMA already received the proper VMA flags at mmap time.

>>
> 
> And it gets weirder, as I could have sworn that I properly tested this
> on v1, I retested with v1 (tags/sent/vfio_pci_mmap-v1 on my
> git.kernel.org/niks and based on v6.9) and there I don't get the above
> warning. I also made sure that it's not caused by my change to
> "current->mm" for v2. But I'm also not hitting the checks David moved
> into follow_pte() so yeah not sure what's going on here.


You mean the mmap_assert_locked()? Yeah, that only checks if you have it 
in read mode, but not in write mode.
Niklas Schnelle June 11, 2024, 2:47 p.m. UTC | #5
> > > 
--- 8< snip 8< ---
> > > > Ughh, I think I just stumbled over a problem with this. This is a
> > > > failing lock held assertion via __is_vma_write_locked() in
> > > > remap_pfn_range_notrack() but I'm not sure yet what exactly causes this
> > > > 
> > > > [   67.338855] ------------[ cut here ]------------
> > > > [   67.338865] WARNING: CPU: 15 PID: 2056 at include/linux/rwsem.h:85 remap_pfn_range_notrack+0x596/0x5b0
> > > > [   67.338874] Modules linked in: <--- 8< --->
> > > > [   67.338931] CPU: 15 PID: 2056 Comm: vfio-test Not tainted 6.10.0-rc1-pci-pfault-00004-g193e3a513cee #5
> > > > [   67.338934] Hardware name: IBM 3931 A01 701 (LPAR)
> > > > [   67.338935] Krnl PSW : 0704c00180000000 000003e54c9730ea (remap_pfn_range_notrack+0x59a/0x5b0)
> > > > [   67.338940]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> > > > [   67.338944] Krnl GPRS: 0000000000000100 000003655915fb78 000002d80b9a5928 000003ff7fa00000
> > > > [   67.338946]            0004008000000000 0000000000004000 0000000000000711 000003ff7fa04000
> > > > [   67.338948]            000002d80c533f00 000002d800000100 000002d81bbe6c28 000002d80b9a5928
> > > > [   67.338950]            000003ff7fa00000 000002d80c533f00 000003e54c973120 000003655915fab0
> > > > [   67.338956] Krnl Code: 000003e54c9730de: a708ffea            lhi     %r0,-22
> > > >                            000003e54c9730e2: a7f4fff6            brc     15,000003e54c9730ce
> > > >                           #000003e54c9730e6: af000000            mc      0,0
> > > >                           >000003e54c9730ea: a7f4fd6e            brc     15,000003e54c972bc6
> > > >                            000003e54c9730ee: af000000            mc      0,0
> > > >                            000003e54c9730f2: af000000            mc      0,0
> > > >                            000003e54c9730f6: 0707                bcr     0,%r7
> > > >                            000003e54c9730f8: 0707                bcr     0,%r7
> > > > [   67.339025] Call Trace:
> > > > [   67.339027]  [<000003e54c9730ea>] remap_pfn_range_notrack+0x59a/0x5b0
> > > > [   67.339032]  [<000003e54c973120>] remap_pfn_range+0x20/0x30
> > > > [   67.339035]  [<000003e4cce5396c>] vfio_pci_mmap_fault+0xec/0x1d0 [vfio_pci_core]
> > > > [   67.339043]  [<000003e54c977240>] handle_mm_fault+0x6b0/0x25a0
> > > > [   67.339046]  [<000003e54c966328>] fixup_user_fault+0x138/0x310
> > > > [   67.339048]  [<000003e54c63a91c>] __s390x_sys_s390_pci_mmio_read+0x28c/0x3a0
> > > > [   67.339051]  [<000003e54c5e200a>] do_syscall+0xea/0x120
> > > > [   67.339055]  [<000003e54d5f9954>] __do_syscall+0x94/0x140
> > > > [   67.339059]  [<000003e54d611020>] system_call+0x70/0xa0
> > > > [   67.339063] Last Breaking-Event-Address:
> > > > [   67.339065]  [<000003e54c972bc2>] remap_pfn_range_notrack+0x72/0x5b0
> > > > [   67.339067] ---[ end trace 0000000000000000 ]---
> > > > 
> > > 
> > > This has me a bit confused so far as __is_vma_write_locked() checks
> > > mmap_assert_write_locked(vma->vm_mm) but most other users of
> > > fixup_user_fault() hold mmap_read_lock() just like this code and
> > > clearly in the non page fault case we only need the read lock.
> 
> This is likely the 
> vm_flags_set()->vma_start_write(vma)->__is_vma_write_locked()

Yes

> 
> which checks mmap_assert_write_locked().
> 
> Setting VMA flags would be racy with the mmap lock in read mode.
> 
> 
> remap_pfn_range() documents: "this is only safe if the mm semaphore is 
> held when called." which doesn't spell out if it needs to be held in 
> write mode (which I think it does) :)

Logically this makes sense to me. At the same time it looks like
fixup_user_fault() expects the caller to only hold mmap_read_lock() as
I do here. In there it even retakes mmap_read_lock(). But then wouldn't
any fault handling by its nature need to hold the write lock?

> 
> 
> My best guess is: if you are using remap_pfn_range() from a fault 
> handler (not during mmap time) you are doing something wrong, that's why 
> you get that report.

@Alex: I guess so far the vfio_pci_mmap_fault() handler is only ever
triggered by "normal"/"actual" page faults where this isn't a problem?
Or could it be a problem there too?

> 
> vmf_insert_pfn() and friends might be better alternatives, that make 
> sure that the VMA already received the proper VMA flags at mmap time.
> 
> > > 
> > 
> > And it gets weirder, as I could have sworn that I properly tested this
> > on v1, I retested with v1 (tags/sent/vfio_pci_mmap-v1 on my
> > git.kernel.org/niks and based on v6.9) and there I don't get the above
> > warning. I also made sure that it's not caused by my change to
> > "current->mm" for v2. But I'm also not hitting the checks David moved
> > into follow_pte() so yeah not sure what's going on here.
> 
> 
> You mean the mmap_assert_locked()? Yeah, that only checks if you have it 
> in read mode, but not in write mode.
> 

Turns out this part was just me being stupid and not running the old
version with lockdep enabled when it "worked" and this only turned into
a normal warn with commit ba168b52bf8e ("mm: use rwsem assertion macros
for mmap_lock"). Rerunning v1 with lockdep on gave me an equivalent
lockdep splat.
David Hildenbrand June 11, 2024, 3:10 p.m. UTC | #6
>>
>> which checks mmap_assert_write_locked().
>>
>> Setting VMA flags would be racy with the mmap lock in read mode.
>>
>>
>> remap_pfn_range() documents: "this is only safe if the mm semaphore is
>> held when called." which doesn't spell out if it needs to be held in
>> write mode (which I think it does) :)
> 
> Logically this makes sense to me. At the same time it looks like
> fixup_user_fault() expects the caller to only hold mmap_read_lock() as
> I do here. In there it even retakes mmap_read_lock(). But then wouldn't
> any fault handling by its nature need to hold the write lock?

Well, if you're calling remap_pfn_range() right now the expectation is 
that we hold it in write mode. :)

Staring at some random users, they all call it from mmap(), where you 
hold the mmap lock in write mode.


I wonder why we are not seeing that splat with vfio all of the time?

That mmap lock check was added "recently". In 1c71222e5f23 we started 
using vm_flags_set(). That (including the mmap_assert_write_locked()) 
check was added via bc292ab00f6c almost 1.5 years ago.

Maybe vfio is a bit special and was never really run with lockdep?

> 
>>
>>
>> My best guess is: if you are using remap_pfn_range() from a fault
>> handler (not during mmap time) you are doing something wrong, that's why
>> you get that report.
> 
> @Alex: I guess so far the vfio_pci_mmap_fault() handler is only ever
> triggered by "normal"/"actual" page faults where this isn't a problem?
> Or could it be a problem there too?
> 

I think we should see it there as well, unless I am missing something.

>>
>> vmf_insert_pfn() and friends might be better alternatives, that make
>> sure that the VMA already received the proper VMA flags at mmap time.
>>


There would be ways of silencing that check: for example, making sure at 
mmap time that these flags are already set, and skipping modifications 
if the flags are already set.

But, we'll run into more similar checks in x86 VM_PAT code, where we 
would do vm_flags_set(vma, VM_PAT) from track_pfn_remap. Some of that 
code really doesn't want to be called concurrently (e.g., "vma->vm_pgoff 
= pfn;").

I thought that we silenced some of these warnings in the past using 
__vm_flags_mod(). But it sounds hacky.

CCing Sureen.
Niklas Schnelle June 11, 2024, 3:37 p.m. UTC | #7
On Tue, 2024-06-11 at 17:10 +0200, David Hildenbrand wrote:
> > > 
> > > which checks mmap_assert_write_locked().
> > > 
> > > Setting VMA flags would be racy with the mmap lock in read mode.
> > > 
> > > 
> > > remap_pfn_range() documents: "this is only safe if the mm semaphore is
> > > held when called." which doesn't spell out if it needs to be held in
> > > write mode (which I think it does) :)
> > 
> > Logically this makes sense to me. At the same time it looks like
> > fixup_user_fault() expects the caller to only hold mmap_read_lock() as
> > I do here. In there it even retakes mmap_read_lock(). But then wouldn't
> > any fault handling by its nature need to hold the write lock?
> 
> Well, if you're calling remap_pfn_range() right now the expectation is 
> that we hold it in write mode. :)
> 
> Staring at some random users, they all call it from mmap(), where you 
> hold the mmap lock in write mode.
> 
> 
> I wonder why we are not seeing that splat with vfio all of the time?
> 
> That mmap lock check was added "recently". In 1c71222e5f23 we started 
> using vm_flags_set(). That (including the mmap_assert_write_locked()) 
> check was added via bc292ab00f6c almost 1.5 years ago.
> 
> Maybe vfio is a bit special and was never really run with lockdep?
> 
> > 
> > > 
> > > 
> > > My best guess is: if you are using remap_pfn_range() from a fault
> > > handler (not during mmap time) you are doing something wrong, that's why
> > > you get that report.
> > 
> > @Alex: I guess so far the vfio_pci_mmap_fault() handler is only ever
> > triggered by "normal"/"actual" page faults where this isn't a problem?
> > Or could it be a problem there too?
> > 
> 
> I think we should see it there as well, unless I am missing something.

Well good news for me, bad news for everyone else. I just reproduced
the same problem on my x86_64 workstation. I "ported over" (hacked it
until it compiles) an x86 version of my trivial vfio-pci user-space
test code that mmaps() the BAR 0 of an NVMe and MMIO reads the NVMe
version field at offset 8. On my x86_64 box this leads to the following
splat (still on v6.10-rc1).

[  555.396773] ------------[ cut here ]------------
[  555.396774] WARNING: CPU: 3 PID: 1424 at include/linux/rwsem.h:85 remap_pfn_range_notrack+0x625/0x650
[  555.396778] Modules linked in: vfio_pci <-- 8< -->
[  555.396877] CPU: 3 PID: 1424 Comm: vfio-test Tainted: G        W          6.10.0-rc1-niks-00007-gb19d6d864df1 #4 d09afec01ce27ca8218580af28295f25e2d2ed53
[  555.396880] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X570 Creator, BIOS P3.40 01/28/2021
[  555.396881] RIP: 0010:remap_pfn_range_notrack+0x625/0x650
[  555.396884] Code: a8 00 00 00 75 39 44 89 e0 48 81 c4 b0 00 00 00 5b 41 5c 41 5d 41 5e 41 5f 5d e9 26 a7 e5 00 cc 0f 0b 41 bc ea ff ff ff eb c9 <0f> 0b 49 8b 47 10 e9 72 fa ff ff e8 8b 56 b5 ff e9 c0 fa ff ff e8
[  555.396887] RSP: 0000:ffffaf8b04ed3bc0 EFLAGS: 00010246
[  555.396889] RAX: ffff9ea747cfe300 RBX: 00000000000ee200 RCX: 0000000000000100
[  555.396890] RDX: 00000000000ee200 RSI: ffff9ea747cfe300 RDI: ffff9ea76db58fd0
[  555.396892] RBP: 00000000ffffffea R08: 8000000000000035 R09: 0000000000000000
[  555.396894] R10: ffff9ea76d9bbf40 R11: ffffffff96e5ce50 R12: 0000000000004000
[  555.396895] R13: 00007f23b988a000 R14: ffff9ea76db58fd0 R15: ffff9ea76db58fd0
[  555.396897] FS:  00007f23b9561740(0000) GS:ffff9eb66e780000(0000) knlGS:0000000000000000
[  555.396899] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  555.396901] CR2: 00007f23b988a008 CR3: 0000000136bde000 CR4: 0000000000350ef0
[  555.396903] Call Trace:
[  555.396904]  <TASK>
[  555.396905]  ? __warn+0x18c/0x2a0
[  555.396908]  ? remap_pfn_range_notrack+0x625/0x650
[  555.396911]  ? report_bug+0x1bb/0x270
[  555.396915]  ? handle_bug+0x42/0x70
[  555.396917]  ? exc_invalid_op+0x1a/0x50
[  555.396920]  ? asm_exc_invalid_op+0x1a/0x20
[  555.396923]  ? __pfx_is_ISA_range+0x10/0x10
[  555.396926]  ? remap_pfn_range_notrack+0x625/0x650
[  555.396929]  ? asm_exc_invalid_op+0x1a/0x20
[  555.396933]  ? track_pfn_remap+0x170/0x180
[  555.396936]  remap_pfn_range+0x6f/0xc0
[  555.396940]  vfio_pci_mmap_fault+0xf3/0x1b0 [vfio_pci_core 6df3b7ac5dcecb63cb090734847a65c799a8fef2]
[  555.396946]  __do_fault+0x11b/0x210
[  555.396949]  do_pte_missing+0x239/0x1350
[  555.396953]  handle_mm_fault+0xb10/0x18b0
[  555.396959]  do_user_addr_fault+0x293/0x710
[  555.396963]  exc_page_fault+0x82/0x1c0
[  555.396966]  asm_exc_page_fault+0x26/0x30
[  555.396968] RIP: 0033:0x55b0ea8bb7ac
[  555.396972] Code: 00 00 b0 00 e8 e5 f8 ff ff 31 c0 48 83 c4 20 5d c3 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 89 7d f8 48 8b 45 f8 <8b> 00 89 c0 5d c3 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48
[  555.396974] RSP: 002b:00007fff80973530 EFLAGS: 00010202
[  555.396976] RAX: 00007f23b988a008 RBX: 00007fff80973738 RCX: 00007f23b988a000
[  555.396978] RDX: 0000000000000001 RSI: 00007fff809735e8 RDI: 00007f23b988a008
[  555.396979] RBP: 00007fff80973530 R08: 0000000000000005 R09: 0000000000000000
[  555.396981] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000002
[  555.396982] R13: 0000000000000000 R14: 00007f23b98c8000 R15: 000055b0ea8bddc0
[  555.396986]  </TASK>
[  555.396987] ---[ end trace 0000000000000000 ]---
Niklas Schnelle June 11, 2024, 3:56 p.m. UTC | #8
On Tue, 2024-06-11 at 17:10 +0200, David Hildenbrand wrote:
> > > 
> > > which checks mmap_assert_write_locked().
> > > 
> > > Setting VMA flags would be racy with the mmap lock in read mode.
> > > 
> > > 
> > > remap_pfn_range() documents: "this is only safe if the mm semaphore is
> > > held when called." which doesn't spell out if it needs to be held in
> > > write mode (which I think it does) :)
> > 
> > Logically this makes sense to me. At the same time it looks like
> > fixup_user_fault() expects the caller to only hold mmap_read_lock() as
> > I do here. In there it even retakes mmap_read_lock(). But then wouldn't
> > any fault handling by its nature need to hold the write lock?
> 
> Well, if you're calling remap_pfn_range() right now the expectation is 
> that we hold it in write mode. :)
> 
> Staring at some random users, they all call it from mmap(), where you 
> hold the mmap lock in write mode.
> 
> 
> I wonder why we are not seeing that splat with vfio all of the time?
> 
> That mmap lock check was added "recently". In 1c71222e5f23 we started 
> using vm_flags_set(). That (including the mmap_assert_write_locked()) 
> check was added via bc292ab00f6c almost 1.5 years ago.
> 
> Maybe vfio is a bit special and was never really run with lockdep?
> 
> > 
> > > 
> > > 
> > > My best guess is: if you are using remap_pfn_range() from a fault
> > > handler (not during mmap time) you are doing something wrong, that's why
> > > you get that report.
> > 
> > @Alex: I guess so far the vfio_pci_mmap_fault() handler is only ever
> > triggered by "normal"/"actual" page faults where this isn't a problem?
> > Or could it be a problem there too?
> > 
> 
> I think we should see it there as well, unless I am missing something.
> 
> > > 
> > > vmf_insert_pfn() and friends might be better alternatives, that make
> > > sure that the VMA already received the proper VMA flags at mmap time.
> > > 
> 
> 
> There would be ways of silencing that check: for example, making sure at 
> mmap time that these flags are already set, and skipping modifications 
> if the flags are already set.
> 
> But, we'll run into more similar checks in x86 VM_PAT code, where we 
> would do vm_flags_set(vma, VM_PAT) from track_pfn_remap. Some of that 
> code really doesn't want to be called concurrently (e.g., "vma->vm_pgoff 
> = pfn;").
> 
> I thought that we silenced some of these warnings in the past using 
> __vm_flags_mod(). But it sounds hacky.
> 
> CCing Sureen.
> 

Before I forget it, thanks a lot for your incredible help David!
Alex Williamson June 11, 2024, 10:21 p.m. UTC | #9
On Tue, 11 Jun 2024 17:37:20 +0200
Niklas Schnelle <schnelle@linux.ibm.com> wrote:

> On Tue, 2024-06-11 at 17:10 +0200, David Hildenbrand wrote:
> > > > 
> > > > which checks mmap_assert_write_locked().
> > > > 
> > > > Setting VMA flags would be racy with the mmap lock in read mode.
> > > > 
> > > > 
> > > > remap_pfn_range() documents: "this is only safe if the mm semaphore is
> > > > held when called." which doesn't spell out if it needs to be held in
> > > > write mode (which I think it does) :)  
> > > 
> > > Logically this makes sense to me. At the same time it looks like
> > > fixup_user_fault() expects the caller to only hold mmap_read_lock() as
> > > I do here. In there it even retakes mmap_read_lock(). But then wouldn't
> > > any fault handling by its nature need to hold the write lock?  
> > 
> > Well, if you're calling remap_pfn_range() right now the expectation is 
> > that we hold it in write mode. :)
> > 
> > Staring at some random users, they all call it from mmap(), where you 
> > hold the mmap lock in write mode.
> > 
> > 
> > I wonder why we are not seeing that splat with vfio all of the time?
> > 
> > That mmap lock check was added "recently". In 1c71222e5f23 we started 
> > using vm_flags_set(). That (including the mmap_assert_write_locked()) 
> > check was added via bc292ab00f6c almost 1.5 years ago.
> > 
> > Maybe vfio is a bit special and was never really run with lockdep?
> >   
> > >   
> > > > 
> > > > 
> > > > My best guess is: if you are using remap_pfn_range() from a fault
> > > > handler (not during mmap time) you are doing something wrong, that's why
> > > > you get that report.  
> > > 
> > > @Alex: I guess so far the vfio_pci_mmap_fault() handler is only ever
> > > triggered by "normal"/"actual" page faults where this isn't a problem?
> > > Or could it be a problem there too?
> > >   
> > 
> > I think we should see it there as well, unless I am missing something.  
> 
> Well good news for me, bad news for everyone else. I just reproduced
> the same problem on my x86_64 workstation. I "ported over" (hacked it
> until it compiles) an x86 version of my trivial vfio-pci user-space
> test code that mmaps() the BAR 0 of an NVMe and MMIO reads the NVMe
> version field at offset 8. On my x86_64 box this leads to the following
> splat (still on v6.10-rc1).

There's already a fix for this queued[1] in my for-linus branch for
v6.10.  The problem has indeed existed with lockdep for some time but
only with the recent lockdep changes to generate a warning regardless
of debug kernel settings has it gone from just sketchy to having a fire
under it.  There's still an outstanding question of whether we
can/should insert as many pfns as we can during the fault[2] to reduce
the new overhead and hopefully at some point we'll have an even cleaner
option to use huge_fault for pfnmaps, but currently
vmf_insert_pfn_{pmd,pud} don't work with those pfnmaps.

So hopefully this problem disappears on current linux-next, but let me
know if there's still an issue.  Thanks,

Alex

[1]https://lore.kernel.org/all/20240530045236.1005864-1-alex.williamson@redhat.com/
[2]https://lore.kernel.org/all/20240607035213.2054226-1-alex.williamson@redhat.com/

> [  555.396773] ------------[ cut here ]------------
> [  555.396774] WARNING: CPU: 3 PID: 1424 at include/linux/rwsem.h:85 remap_pfn_range_notrack+0x625/0x650
> [  555.396778] Modules linked in: vfio_pci <-- 8< -->
> [  555.396877] CPU: 3 PID: 1424 Comm: vfio-test Tainted: G        W          6.10.0-rc1-niks-00007-gb19d6d864df1 #4 d09afec01ce27ca8218580af28295f25e2d2ed53
> [  555.396880] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X570 Creator, BIOS P3.40 01/28/2021
> [  555.396881] RIP: 0010:remap_pfn_range_notrack+0x625/0x650
> [  555.396884] Code: a8 00 00 00 75 39 44 89 e0 48 81 c4 b0 00 00 00 5b 41 5c 41 5d 41 5e 41 5f 5d e9 26 a7 e5 00 cc 0f 0b 41 bc ea ff ff ff eb c9 <0f> 0b 49 8b 47 10 e9 72 fa ff ff e8 8b 56 b5 ff e9 c0 fa ff ff e8
> [  555.396887] RSP: 0000:ffffaf8b04ed3bc0 EFLAGS: 00010246
> [  555.396889] RAX: ffff9ea747cfe300 RBX: 00000000000ee200 RCX: 0000000000000100
> [  555.396890] RDX: 00000000000ee200 RSI: ffff9ea747cfe300 RDI: ffff9ea76db58fd0
> [  555.396892] RBP: 00000000ffffffea R08: 8000000000000035 R09: 0000000000000000
> [  555.396894] R10: ffff9ea76d9bbf40 R11: ffffffff96e5ce50 R12: 0000000000004000
> [  555.396895] R13: 00007f23b988a000 R14: ffff9ea76db58fd0 R15: ffff9ea76db58fd0
> [  555.396897] FS:  00007f23b9561740(0000) GS:ffff9eb66e780000(0000) knlGS:0000000000000000
> [  555.396899] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  555.396901] CR2: 00007f23b988a008 CR3: 0000000136bde000 CR4: 0000000000350ef0
> [  555.396903] Call Trace:
> [  555.396904]  <TASK>
> [  555.396905]  ? __warn+0x18c/0x2a0
> [  555.396908]  ? remap_pfn_range_notrack+0x625/0x650
> [  555.396911]  ? report_bug+0x1bb/0x270
> [  555.396915]  ? handle_bug+0x42/0x70
> [  555.396917]  ? exc_invalid_op+0x1a/0x50
> [  555.396920]  ? asm_exc_invalid_op+0x1a/0x20
> [  555.396923]  ? __pfx_is_ISA_range+0x10/0x10
> [  555.396926]  ? remap_pfn_range_notrack+0x625/0x650
> [  555.396929]  ? asm_exc_invalid_op+0x1a/0x20
> [  555.396933]  ? track_pfn_remap+0x170/0x180
> [  555.396936]  remap_pfn_range+0x6f/0xc0
> [  555.396940]  vfio_pci_mmap_fault+0xf3/0x1b0 [vfio_pci_core 6df3b7ac5dcecb63cb090734847a65c799a8fef2]
> [  555.396946]  __do_fault+0x11b/0x210
> [  555.396949]  do_pte_missing+0x239/0x1350
> [  555.396953]  handle_mm_fault+0xb10/0x18b0
> [  555.396959]  do_user_addr_fault+0x293/0x710
> [  555.396963]  exc_page_fault+0x82/0x1c0
> [  555.396966]  asm_exc_page_fault+0x26/0x30
> [  555.396968] RIP: 0033:0x55b0ea8bb7ac
> [  555.396972] Code: 00 00 b0 00 e8 e5 f8 ff ff 31 c0 48 83 c4 20 5d c3 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 89 7d f8 48 8b 45 f8 <8b> 00 89 c0 5d c3 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48
> [  555.396974] RSP: 002b:00007fff80973530 EFLAGS: 00010202
> [  555.396976] RAX: 00007f23b988a008 RBX: 00007fff80973738 RCX: 00007f23b988a000
> [  555.396978] RDX: 0000000000000001 RSI: 00007fff809735e8 RDI: 00007f23b988a008
> [  555.396979] RBP: 00007fff80973530 R08: 0000000000000005 R09: 0000000000000000
> [  555.396981] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000002
> [  555.396982] R13: 0000000000000000 R14: 00007f23b98c8000 R15: 000055b0ea8bddc0
> [  555.396986]  </TASK>
> [  555.396987] ---[ end trace 0000000000000000 ]---
>
David Hildenbrand June 12, 2024, 7:28 a.m. UTC | #10
On 12.06.24 00:21, Alex Williamson wrote:
> On Tue, 11 Jun 2024 17:37:20 +0200
> Niklas Schnelle <schnelle@linux.ibm.com> wrote:
> 
>> On Tue, 2024-06-11 at 17:10 +0200, David Hildenbrand wrote:
>>>>>
>>>>> which checks mmap_assert_write_locked().
>>>>>
>>>>> Setting VMA flags would be racy with the mmap lock in read mode.
>>>>>
>>>>>
>>>>> remap_pfn_range() documents: "this is only safe if the mm semaphore is
>>>>> held when called." which doesn't spell out if it needs to be held in
>>>>> write mode (which I think it does) :)
>>>>
>>>> Logically this makes sense to me. At the same time it looks like
>>>> fixup_user_fault() expects the caller to only hold mmap_read_lock() as
>>>> I do here. In there it even retakes mmap_read_lock(). But then wouldn't
>>>> any fault handling by its nature need to hold the write lock?
>>>
>>> Well, if you're calling remap_pfn_range() right now the expectation is
>>> that we hold it in write mode. :)
>>>
>>> Staring at some random users, they all call it from mmap(), where you
>>> hold the mmap lock in write mode.
>>>
>>>
>>> I wonder why we are not seeing that splat with vfio all of the time?
>>>
>>> That mmap lock check was added "recently". In 1c71222e5f23 we started
>>> using vm_flags_set(). That (including the mmap_assert_write_locked())
>>> check was added via bc292ab00f6c almost 1.5 years ago.
>>>
>>> Maybe vfio is a bit special and was never really run with lockdep?
>>>    
>>>>    
>>>>>
>>>>>
>>>>> My best guess is: if you are using remap_pfn_range() from a fault
>>>>> handler (not during mmap time) you are doing something wrong, that's why
>>>>> you get that report.
>>>>
>>>> @Alex: I guess so far the vfio_pci_mmap_fault() handler is only ever
>>>> triggered by "normal"/"actual" page faults where this isn't a problem?
>>>> Or could it be a problem there too?
>>>>    
>>>
>>> I think we should see it there as well, unless I am missing something.
>>
>> Well good news for me, bad news for everyone else. I just reproduced
>> the same problem on my x86_64 workstation. I "ported over" (hacked it
>> until it compiles) an x86 version of my trivial vfio-pci user-space
>> test code that mmaps() the BAR 0 of an NVMe and MMIO reads the NVMe
>> version field at offset 8. On my x86_64 box this leads to the following
>> splat (still on v6.10-rc1).
> 
> There's already a fix for this queued[1] in my for-linus branch for
> v6.10.  The problem has indeed existed with lockdep for some time but
> only with the recent lockdep changes to generate a warning regardless
> of debug kernel settings has it gone from just sketchy to having a fire
> under it.  There's still an outstanding question of whether we
> can/should insert as many pfns as we can during the fault[2] to reduce
> the new overhead and hopefully at some point we'll have an even cleaner
> option to use huge_fault for pfnmaps, but currently
> vmf_insert_pfn_{pmd,pud} don't work with those pfnmaps.
> 
> So hopefully this problem disappears on current linux-next, but let me
> know if there's still an issue.  Thanks,

I see us now using vmf_insert_pfn(), which should be the right thing to 
do. So I suspect this problem should be disappearing.
diff mbox series

Patch

diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
index 5398729bfe1b..80c21b1a101c 100644
--- a/arch/s390/pci/pci_mmio.c
+++ b/arch/s390/pci/pci_mmio.c
@@ -170,8 +170,12 @@  SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
 		goto out_unlock_mmap;
 
 	ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
-	if (ret)
-		goto out_unlock_mmap;
+	if (ret) {
+		fixup_user_fault(current->mm, mmio_addr, FAULT_FLAG_WRITE, NULL);
+		ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
+		if (ret)
+			goto out_unlock_mmap;
+	}
 
 	io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
 			(mmio_addr & ~PAGE_MASK));
@@ -305,12 +309,16 @@  SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
 	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
 		goto out_unlock_mmap;
 	ret = -EACCES;
-	if (!(vma->vm_flags & VM_WRITE))
+	if (!(vma->vm_flags & VM_READ))
 		goto out_unlock_mmap;
 
 	ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
-	if (ret)
-		goto out_unlock_mmap;
+	if (ret) {
+		fixup_user_fault(current->mm, mmio_addr, 0, NULL);
+		ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
+		if (ret)
+			goto out_unlock_mmap;
+	}
 
 	io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
 			(mmio_addr & ~PAGE_MASK));