[2/5] vfio/spapr_tce: use pinned_vm instead of locked_vm to account pinned pages

Message ID 20190211224437.25267-3-daniel.m.jordan@oracle.com

Commit Message

Daniel Jordan Feb. 11, 2019, 10:44 p.m. UTC
Beginning with bc3e53f682d9 ("mm: distinguish between mlocked and pinned
pages"), locked and pinned pages are accounted separately.  The SPAPR
TCE VFIO IOMMU driver accounts pinned pages to locked_vm; use pinned_vm
instead.

pinned_vm recently became atomic and so no longer relies on mmap_sem
held as writer: delete that locking.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
---
 Documentation/vfio.txt              |  6 +--
 drivers/vfio/vfio_iommu_spapr_tce.c | 64 ++++++++++++++---------------
 2 files changed, 33 insertions(+), 37 deletions(-)
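
For readers unfamiliar with the pattern, here is a minimal userspace C11
analogue (not kernel code) of the accounting scheme the patch adopts:
speculatively add to an atomic counter, then back out if the limit was
exceeded.  The names and the limit value are illustrative, and the
driver's CAP_IPC_LOCK bypass is omitted.

/*
 * Userspace sketch of try_increment_pinned_vm()'s add-then-back-out
 * pattern.  One atomic read-modify-write replaces the old
 * down_write(&mm->mmap_sem)/up_write() critical section.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long pinned_vm;          /* stands in for mm->pinned_vm */
static const long lock_limit = 1024;   /* rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT */

static int try_increment_pinned(long npages)
{
	long pinned = atomic_fetch_add(&pinned_vm, npages) + npages;

	if (pinned > lock_limit) {
		atomic_fetch_sub(&pinned_vm, npages);  /* back out on failure */
		return -1;                             /* -ENOMEM in the driver */
	}
	return 0;
}

int main(void)
{
	printf("%d\n", try_increment_pinned(1000));  /* 0: fits          */
	printf("%d\n", try_increment_pinned(100));   /* -1: over limit   */
	printf("%ld\n", atomic_load(&pinned_vm));    /* 1000: backed out */
	return 0;
}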

Comments

Alexey Kardashevskiy Feb. 12, 2019, 6:56 a.m. UTC | #1
On 12/02/2019 09:44, Daniel Jordan wrote:
> Beginning with bc3e53f682d9 ("mm: distinguish between mlocked and pinned
> pages"), locked and pinned pages are accounted separately.  The SPAPR
> TCE VFIO IOMMU driver accounts pinned pages to locked_vm; use pinned_vm
> instead.
> 
> pinned_vm recently became atomic and so no longer relies on mmap_sem
> held as writer: delete that locking.
> 
> Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> [...]
> -static void decrement_locked_vm(struct mm_struct *mm, long npages)
> +static void decrement_pinned_vm(struct mm_struct *mm, long npages)
>  {
>  	if (!mm || !npages)
>  		return;
>  
> -	down_write(&mm->mmap_sem);
> -	if (WARN_ON_ONCE(npages > mm->locked_vm))
> -		npages = mm->locked_vm;
> -	mm->locked_vm -= npages;
> -	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
> +	if (WARN_ON_ONCE(npages > atomic64_read(&mm->pinned_vm)))
> +		npages = atomic64_read(&mm->pinned_vm);
> +	atomic64_sub(npages, &mm->pinned_vm);
> +	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%lu\n", current->pid,
>  			npages << PAGE_SHIFT,
> -			mm->locked_vm << PAGE_SHIFT,
> +			atomic64_read(&mm->pinned_vm) << PAGE_SHIFT,
>  			rlimit(RLIMIT_MEMLOCK));
> -	up_write(&mm->mmap_sem);


So it used to be down_write+up_write and stuff in between.

Now it is 3 independent accesses (actually 4 but the last one is
diagnostic) with no locking around them. Why exactly do we not need a
lock anymore? Thanks,
Christopher Lameter Feb. 12, 2019, 4:50 p.m. UTC | #2
On Tue, 12 Feb 2019, Alexey Kardashevskiy wrote:

> Now it is 3 independent accesses (actually 4 but the last one is
> diagnostic) with no locking around them. Why exactly do we not need a
> lock anymore? Thanks,

Updating a regular counter is racy and requires a lock. It was converted
to be an atomic which can be incremented without a race.
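
To make the lost-update race concrete: a plain counter incremented from
two threads without a lock drops updates, while an atomic one does not.
A small illustrative C11/pthreads program (the unlocked increment is a
deliberate data race, shown only to demonstrate the point):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static long plain;              /* like the old locked_vm without mmap_sem */
static atomic_long atomic_ctr;  /* like the new atomic64_t pinned_vm       */

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 1000000; i++) {
		plain += 1;                        /* racy read-modify-write */
		atomic_fetch_add(&atomic_ctr, 1);  /* single indivisible RMW */
	}
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, worker, NULL);
	pthread_create(&t2, NULL, worker, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);

	/* "plain" typically prints less than 2000000; "atomic" is exact. */
	printf("plain:  %ld\natomic: %ld\n", plain, atomic_load(&atomic_ctr));
	return 0;
}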
Daniel Jordan Feb. 12, 2019, 5:18 p.m. UTC | #3
On Tue, Feb 12, 2019 at 04:50:11PM +0000, Christopher Lameter wrote:
> On Tue, 12 Feb 2019, Alexey Kardashevskiy wrote:
> 
> > Now it is 3 independent accesses (actually 4 but the last one is
> > diagnostic) with no locking around them. Why exactly do we not need a
> > lock anymore? Thanks,
> 
> Updating a regular counter is racy and requires a lock. It was converted
> to be an atomic which can be incremented without a race.

Yes, though Alexey may have meant that the multiple reads of the atomic in
decrement_pinned_vm are racy.  It only matters when there's a bug that would
make the counter go negative, but it's there.

And FWIW the debug print in try_increment_pinned_vm is also racy.

This fixes all that.  It doesn't try to correct the negative pinned_vm as the
old code did, because a negative count is already a bug, and adjusting the
value by the negative amount seems to do nothing but make debugging harder.

If it's ok, I'll respin the whole series this way (another point in favor of
a common helper).

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index f47e020dc5e4..b79257304de6 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -53,25 +53,24 @@ static long try_increment_pinned_vm(struct mm_struct *mm, long npages)
 		atomic64_sub(npages, &mm->pinned_vm);
 	}
 
-	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%lu%s\n", current->pid,
-			npages << PAGE_SHIFT,
-			atomic64_read(&mm->pinned_vm) << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
+	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %lld/%lu%s\n", current->pid,
+			npages << PAGE_SHIFT, pinned << PAGE_SHIFT,
+			lock_limit, ret ? " - exceeded" : "");
 
 	return ret;
 }
 
 static void decrement_pinned_vm(struct mm_struct *mm, long npages)
 {
+	s64 pinned;
+
 	if (!mm || !npages)
 		return;
 
-	if (WARN_ON_ONCE(npages > atomic64_read(&mm->pinned_vm)))
-		npages = atomic64_read(&mm->pinned_vm);
-	atomic64_sub(npages, &mm->pinned_vm);
-	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%lu\n", current->pid,
-			npages << PAGE_SHIFT,
-			atomic64_read(&mm->pinned_vm) << PAGE_SHIFT,
+	pinned = atomic64_sub_return(npages, &mm->pinned_vm);
+	WARN_ON_ONCE(pinned < 0);
+	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %lld/%lu\n", current->pid,
+			npages << PAGE_SHIFT, pinned << PAGE_SHIFT,
 			rlimit(RLIMIT_MEMLOCK));
 }
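
A sketch of why the single atomic64_sub_return() above closes the window
Daniel describes: with a separate read and subtract, the counter can change
between the check and the update, whereas sub_return updates and observes
the new value in one indivisible step.  A userspace C11 analogue with
hypothetical names:

#include <stdatomic.h>
#include <stdio.h>

static atomic_long pinned_vm;

static void decrement_pinned(long npages)
{
	/*
	 * Racy version (two separate accesses, as in the code being replaced):
	 *   if (npages > atomic_load(&pinned_vm))   ... another thread may
	 *   atomic_fetch_sub(&pinned_vm, npages);   ... run in between.
	 *
	 * Atomic version: the new value is obtained atomically with the
	 * subtraction, so the check and the update cannot be torn apart.
	 */
	long pinned = atomic_fetch_sub(&pinned_vm, npages) - npages;

	if (pinned < 0)  /* counterpart of WARN_ON_ONCE(pinned < 0) */
		fprintf(stderr, "pinned_vm went negative: %ld\n", pinned);
}

int main(void)
{
	atomic_store(&pinned_vm, 4);
	decrement_pinned(8);   /* prints the negative-count warning */
	return 0;
}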
Alex Williamson Feb. 12, 2019, 6:56 p.m. UTC | #4
On Tue, 12 Feb 2019 17:56:18 +1100
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:

> On 12/02/2019 09:44, Daniel Jordan wrote:
> > [...]
> > @@ -324,7 +324,7 @@ This implementation has some specifics:
> >  		returns the size and the start of the DMA window on the PCI bus.
> >  
> >  	VFIO_IOMMU_ENABLE
> > -		enables the container. The locked pages accounting
> > +		enables the container. The pinned pages accounting
> >  		is done at this point. This lets user first to know what
> >  		the DMA window is and adjust rlimit before doing any real job.

I don't know of a ulimit only covering pinned pages, so for
documentation it seems more correct to continue referring to this as
locked page accounting.
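
Whether pinned pages should count against it is the point under discussion,
but the limit that both the old and new code compare against is the same
one: RLIMIT_MEMLOCK.  A quick illustrative userspace check of that limit,
converted to pages the same way the driver does:

#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
	struct rlimit rlim;
	long page_size = sysconf(_SC_PAGESIZE);

	if (getrlimit(RLIMIT_MEMLOCK, &rlim))
		return 1;

	/* rlim_cur may be RLIM_INFINITY; the driver computes
	 * rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT, equivalent to this division. */
	printf("RLIMIT_MEMLOCK: %llu bytes, %llu pages\n",
	       (unsigned long long)rlim.rlim_cur,
	       (unsigned long long)(rlim.rlim_cur / page_size));
	return 0;
}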

> > [...]
> 
> 
> So it used to be down_write+up_write and stuff in between.
> 
> Now it is 3 independent accesses (actually 4 but the last one is
> diagnostic) with no locking around them. Why exactly do we not need a
> lock anymore? Thanks,

The first 2 look pretty sketchy to me; is there a case where you don't
know how many pages you've pinned to unpin them?  And can it ever
really be correct to just unpin whatever remains?  The last access is
diagnostic, which leaves 1.  Daniel's rework to warn on a negative
result looks more sane. Thanks,

Alex
Alexey Kardashevskiy Feb. 13, 2019, 12:34 a.m. UTC | #5
On 13/02/2019 05:56, Alex Williamson wrote:
> On Tue, 12 Feb 2019 17:56:18 +1100
> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> 
>> On 12/02/2019 09:44, Daniel Jordan wrote:
>>> [...]
>>
>>
>> So it used to be down_write+up_write and stuff in between.
>>
>> Now it is 3 independent accesses (actually 4 but the last one is
>> diagnostic) with no locking around them. Why exactly do we not need a
>> lock anymore? Thanks,
> 
> The first 2 look pretty sketchy to me; is there a case where you don't
> know how many pages you've pinned to unpin them?

No case like this; that is why it is a WARN_ON_ONCE(). At the time I may
have been under the impression that pinned_vm was system-global, hence
that adjustment, but we do not really need it there.

>  And can it ever
> really be correct to just unpin whatever remains?  The last access is
> diagnostic, which leaves 1.  Daniel's rework to warn on a negative
> result looks more sane. Thanks,

Yes it does look sane.
Alexey Kardashevskiy Feb. 13, 2019, 12:37 a.m. UTC | #6
On 13/02/2019 04:18, Daniel Jordan wrote:
> On Tue, Feb 12, 2019 at 04:50:11PM +0000, Christopher Lameter wrote:
>> On Tue, 12 Feb 2019, Alexey Kardashevskiy wrote:
>>
>>> Now it is 3 independent accesses (actually 4 but the last one is
>>> diagnostic) with no locking around them. Why exactly do we not need a
>>> lock anymore? Thanks,
>>
>> Updating a regular counter is racy and requires a lock. It was converted
>> to be an atomic which can be incremented without a race.
> 
> Yes, though Alexey may have meant that the multiple reads of the atomic in
> decrement_pinned_vm are racy.

Yes, I meant this race; thanks for clarifying.

>  It only matters when there's a bug that would
> make the counter go negative, but it's there.
> 
> And FWIW the debug print in try_increment_pinned_vm is also racy.
> 
> This fixes all that.  It doesn't try to correct the negative pinned_vm as the
> old code did, because a negative count is already a bug, and adjusting the
> value by the negative amount seems to do nothing but make debugging harder.
> 
> If it's ok, I'll respin the whole series this way (another point in favor of
> a common helper).

This looks good, thanks for fixing this.

Patch

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index f1a4d3c3ba0b..fa37d65363f9 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -308,7 +308,7 @@  This implementation has some specifics:
    currently there is no way to reduce the number of calls. In order to make
    things faster, the map/unmap handling has been implemented in real mode
    which provides an excellent performance which has limitations such as
-   inability to do locked pages accounting in real time.
+   inability to do pinned pages accounting in real time.
 
 4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
    subtree that can be treated as a unit for the purposes of partitioning and
@@ -324,7 +324,7 @@  This implementation has some specifics:
 		returns the size and the start of the DMA window on the PCI bus.
 
 	VFIO_IOMMU_ENABLE
-		enables the container. The locked pages accounting
+		enables the container. The pinned pages accounting
 		is done at this point. This lets user first to know what
 		the DMA window is and adjust rlimit before doing any real job.
 
@@ -454,7 +454,7 @@  This implementation has some specifics:
 
    PPC64 paravirtualized guests generate a lot of map/unmap requests,
    and the handling of those includes pinning/unpinning pages and updating
-   mm::locked_vm counter to make sure we do not exceed the rlimit.
+   mm::pinned_vm counter to make sure we do not exceed the rlimit.
    The v2 IOMMU splits accounting and pinning into separate operations:
 
    - VFIO_IOMMU_SPAPR_REGISTER_MEMORY/VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY ioctls
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index c424913324e3..f47e020dc5e4 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -34,9 +34,11 @@ 
 static void tce_iommu_detach_group(void *iommu_data,
 		struct iommu_group *iommu_group);
 
-static long try_increment_locked_vm(struct mm_struct *mm, long npages)
+static long try_increment_pinned_vm(struct mm_struct *mm, long npages)
 {
-	long ret = 0, locked, lock_limit;
+	long ret = 0;
+	s64 pinned;
+	unsigned long lock_limit;
 
 	if (WARN_ON_ONCE(!mm))
 		return -EPERM;
@@ -44,39 +46,33 @@  static long try_increment_locked_vm(struct mm_struct *mm, long npages)
 	if (!npages)
 		return 0;
 
-	down_write(&mm->mmap_sem);
-	locked = mm->locked_vm + npages;
+	pinned = atomic64_add_return(npages, &mm->pinned_vm);
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+	if (pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
 		ret = -ENOMEM;
-	else
-		mm->locked_vm += npages;
+		atomic64_sub(npages, &mm->pinned_vm);
+	}
 
-	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
+	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%lu%s\n", current->pid,
 			npages << PAGE_SHIFT,
-			mm->locked_vm << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK),
-			ret ? " - exceeded" : "");
-
-	up_write(&mm->mmap_sem);
+			atomic64_read(&mm->pinned_vm) << PAGE_SHIFT,
+			rlimit(RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
 
 	return ret;
 }
 
-static void decrement_locked_vm(struct mm_struct *mm, long npages)
+static void decrement_pinned_vm(struct mm_struct *mm, long npages)
 {
 	if (!mm || !npages)
 		return;
 
-	down_write(&mm->mmap_sem);
-	if (WARN_ON_ONCE(npages > mm->locked_vm))
-		npages = mm->locked_vm;
-	mm->locked_vm -= npages;
-	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
+	if (WARN_ON_ONCE(npages > atomic64_read(&mm->pinned_vm)))
+		npages = atomic64_read(&mm->pinned_vm);
+	atomic64_sub(npages, &mm->pinned_vm);
+	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%lu\n", current->pid,
 			npages << PAGE_SHIFT,
-			mm->locked_vm << PAGE_SHIFT,
+			atomic64_read(&mm->pinned_vm) << PAGE_SHIFT,
 			rlimit(RLIMIT_MEMLOCK));
-	up_write(&mm->mmap_sem);
 }
 
 /*
@@ -110,7 +106,7 @@  struct tce_container {
 	bool enabled;
 	bool v2;
 	bool def_window_pending;
-	unsigned long locked_pages;
+	unsigned long pinned_pages;
 	struct mm_struct *mm;
 	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
 	struct list_head group_list;
@@ -283,7 +279,7 @@  static int tce_iommu_find_free_table(struct tce_container *container)
 static int tce_iommu_enable(struct tce_container *container)
 {
 	int ret = 0;
-	unsigned long locked;
+	unsigned long pinned;
 	struct iommu_table_group *table_group;
 	struct tce_iommu_group *tcegrp;
 
@@ -292,15 +288,15 @@  static int tce_iommu_enable(struct tce_container *container)
 
 	/*
 	 * When userspace pages are mapped into the IOMMU, they are effectively
-	 * locked memory, so, theoretically, we need to update the accounting
-	 * of locked pages on each map and unmap.  For powerpc, the map unmap
+	 * pinned memory, so, theoretically, we need to update the accounting
+	 * of pinned pages on each map and unmap.  For powerpc, the map unmap
 	 * paths can be very hot, though, and the accounting would kill
 	 * performance, especially since it would be difficult to impossible
 	 * to handle the accounting in real mode only.
 	 *
 	 * To address that, rather than precisely accounting every page, we
-	 * instead account for a worst case on locked memory when the iommu is
-	 * enabled and disabled.  The worst case upper bound on locked memory
+	 * instead account for a worst case on pinned memory when the iommu is
+	 * enabled and disabled.  The worst case upper bound on pinned memory
 	 * is the size of the whole iommu window, which is usually relatively
 	 * small (compared to total memory sizes) on POWER hardware.
 	 *
@@ -317,7 +313,7 @@  static int tce_iommu_enable(struct tce_container *container)
 	 *
 	 * So we do not allow enabling a container without a group attached
 	 * as there is no way to know how much we should increment
-	 * the locked_vm counter.
+	 * the pinned_vm counter.
 	 */
 	if (!tce_groups_attached(container))
 		return -ENODEV;
@@ -335,12 +331,12 @@  static int tce_iommu_enable(struct tce_container *container)
 	if (ret)
 		return ret;
 
-	locked = table_group->tce32_size >> PAGE_SHIFT;
-	ret = try_increment_locked_vm(container->mm, locked);
+	pinned = table_group->tce32_size >> PAGE_SHIFT;
+	ret = try_increment_pinned_vm(container->mm, pinned);
 	if (ret)
 		return ret;
 
-	container->locked_pages = locked;
+	container->pinned_pages = pinned;
 
 	container->enabled = true;
 
@@ -355,7 +351,7 @@  static void tce_iommu_disable(struct tce_container *container)
 	container->enabled = false;
 
 	BUG_ON(!container->mm);
-	decrement_locked_vm(container->mm, container->locked_pages);
+	decrement_pinned_vm(container->mm, container->pinned_pages);
 }
 
 static void *tce_iommu_open(unsigned long arg)
@@ -658,7 +654,7 @@  static long tce_iommu_create_table(struct tce_container *container,
 	if (!table_size)
 		return -EINVAL;
 
-	ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
+	ret = try_increment_pinned_vm(container->mm, table_size >> PAGE_SHIFT);
 	if (ret)
 		return ret;
 
@@ -677,7 +673,7 @@  static void tce_iommu_free_table(struct tce_container *container,
 	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 
 	iommu_tce_table_put(tbl);
-	decrement_locked_vm(container->mm, pages);
+	decrement_pinned_vm(container->mm, pages);
 }
 
 static long tce_iommu_create_window(struct tce_container *container,
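
To make the worst-case accounting described in the tce_iommu_enable()
comment above concrete: rather than charging each map/unmap, the driver
charges the whole DMA window once at VFIO_IOMMU_ENABLE time and releases
it on disable.  A worked example with assumed (hypothetical) sizes:

#include <stdio.h>

int main(void)
{
	unsigned long tce32_size = 2UL << 30;  /* 2 GiB DMA window (assumed)   */
	unsigned int page_shift = 16;          /* 64 KiB pages, common on POWER */

	/* pinned = table_group->tce32_size >> PAGE_SHIFT */
	printf("pages charged once at enable: %lu\n", tce32_size >> page_shift);
	/* prints 32768, charged against RLIMIT_MEMLOCK a single time */
	return 0;
}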