diff mbox series

[6/7] nouveau/dmem: Evict device private memory during release

Message ID 072e1ce590fe101a4cdbd5e91b1702efebb6d0fd.1664171943.git-series.apopple@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Fix several device private page reference counting issues | expand

Commit Message

Alistair Popple Sept. 26, 2022, 6:03 a.m. UTC
When the module is unloaded or a GPU is unbound from the module it is
possible for device private pages to be left mapped in currently running
processes. This leads to a kernel crash when the pages are either freed
or accessed from the CPU because the GPU and associated data structures
and callbacks have all been freed.

Fix this by migrating any mappings back to normal CPU memory prior to
freeing the GPU memory chunks and associated device private pages.

Signed-off-by: Alistair Popple <apopple@nvidia.com>

---

I assume the AMD driver might have a similar issue. However I can't see
where device private (or coherent) pages actually get unmapped/freed
during teardown as I couldn't find any relevant calls to
devm_memunmap(), memunmap(), devm_release_mem_region() or
release_mem_region(). So it appears that ZONE_DEVICE pages are not being
properly freed during module unload, unless I'm missing something?
---
 drivers/gpu/drm/nouveau/nouveau_dmem.c | 48 +++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+)

Comments

kernel test robot Sept. 26, 2022, 1:28 p.m. UTC | #1
Hi Alistair,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on 088b8aa537c2c767765f1c19b555f21ffe555786]

url:    https://github.com/intel-lab-lkp/linux/commits/Alistair-Popple/Fix-several-device-private-page-reference-counting-issues/20220926-140640
base:   088b8aa537c2c767765f1c19b555f21ffe555786
config: arm64-allyesconfig
compiler: aarch64-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/e99dcd1c3eb07d0787b26d7df178659aec9e3c74
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Alistair-Popple/Fix-several-device-private-page-reference-counting-issues/20220926-140640
        git checkout e99dcd1c3eb07d0787b26d7df178659aec9e3c74
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=arm64 SHELL=/bin/bash drivers/gpu/drm/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/nouveau/nouveau_dmem.c:376:1: warning: no previous prototype for 'nouveau_dmem_evict_chunk' [-Wmissing-prototypes]
     376 | nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
         | ^~~~~~~~~~~~~~~~~~~~~~~~


vim +/nouveau_dmem_evict_chunk +376 drivers/gpu/drm/nouveau/nouveau_dmem.c

   371	
   372	/*
   373	 * Evict all pages mapping a chunk.
   374	 */
   375	void
 > 376	nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
   377	{
   378		unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT;
   379		unsigned long *src_pfns, *dst_pfns;
   380		dma_addr_t *dma_addrs;
   381		struct nouveau_fence *fence;
   382	
   383		src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
   384		dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
   385		dma_addrs = kcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL);
   386	
   387		migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT,
   388				npages);
   389	
   390		for (i = 0; i < npages; i++) {
   391			if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
   392				struct page *dpage;
   393	
   394				/*
   395			 * __GFP_NOFAIL because the GPU is going away and there
   396				 * is nothing sensible we can do if we can't copy the
   397				 * data back.
   398				 */
   399				dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL);
   400				dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
   401				nouveau_dmem_copy_one(chunk->drm,
   402						migrate_pfn_to_page(src_pfns[i]), dpage,
   403						&dma_addrs[i]);
   404			}
   405		}
   406	
   407		nouveau_fence_new(chunk->drm->dmem->migrate.chan, false, &fence);
   408		migrate_device_pages(src_pfns, dst_pfns, npages);
   409		nouveau_dmem_fence_done(&fence);
   410		migrate_device_finalize(src_pfns, dst_pfns, npages);
   411		kfree(src_pfns);
   412		kfree(dst_pfns);
   413		for (i = 0; i < npages; i++)
   414			dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
   415		kfree(dma_addrs);
   416	}
   417
Lyude Paul Sept. 26, 2022, 9:35 p.m. UTC | #2
On Mon, 2022-09-26 at 16:03 +1000, Alistair Popple wrote:
> When the module is unloaded or a GPU is unbound from the module it is
> possible for device private pages to be left mapped in currently running
> processes. This leads to a kernel crash when the pages are either freed
> or accessed from the CPU because the GPU and associated data structures
> and callbacks have all been freed.
> 
> Fix this by migrating any mappings back to normal CPU memory prior to
> freeing the GPU memory chunks and associated device private pages.
> 
> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> 
> ---
> 
> I assume the AMD driver might have a similar issue. However I can't see
> where device private (or coherent) pages actually get unmapped/freed
> during teardown as I couldn't find any relevant calls to
> devm_memunmap(), memunmap(), devm_release_mem_region() or
> release_mem_region(). So it appears that ZONE_DEVICE pages are not being
> properly freed during module unload, unless I'm missing something?

I've got no idea, will poke Ben to see if they know the answer to this

> ---
>  drivers/gpu/drm/nouveau/nouveau_dmem.c | 48 +++++++++++++++++++++++++++-
>  1 file changed, 48 insertions(+)
> 
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> index 66ebbd4..3b247b8 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> @@ -369,6 +369,52 @@ nouveau_dmem_suspend(struct nouveau_drm *drm)
>  	mutex_unlock(&drm->dmem->mutex);
>  }
>  
> +/*
> + * Evict all pages mapping a chunk.
> + */
> +void
> +nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
> +{
> +	unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT;
> +	unsigned long *src_pfns, *dst_pfns;
> +	dma_addr_t *dma_addrs;
> +	struct nouveau_fence *fence;
> +
> +	src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
> +	dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
> +	dma_addrs = kcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL);
> +
> +	migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT,
> +			npages);
> +
> +	for (i = 0; i < npages; i++) {
> +		if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
> +			struct page *dpage;
> +
> +			/*
> +			 * _GFP_NOFAIL because the GPU is going away and there
> +			 * is nothing sensible we can do if we can't copy the
> +			 * data back.
> +			 */

You'll have to excuse me for a moment since this area of nouveau isn't one of
my strongpoints, but are we sure about this? IIRC __GFP_NOFAIL means infinite
retry, in the case of a GPU hotplug event I would assume we would rather just
stop trying to migrate things to the GPU and just drop the data instead of
hanging on infinite retries.

> +			dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL);
> +			dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
> +			nouveau_dmem_copy_one(chunk->drm,
> +					migrate_pfn_to_page(src_pfns[i]), dpage,
> +					&dma_addrs[i]);
> +		}
> +	}
> +
> +	nouveau_fence_new(chunk->drm->dmem->migrate.chan, false, &fence);
> +	migrate_device_pages(src_pfns, dst_pfns, npages);
> +	nouveau_dmem_fence_done(&fence);
> +	migrate_device_finalize(src_pfns, dst_pfns, npages);
> +	kfree(src_pfns);
> +	kfree(dst_pfns);
> +	for (i = 0; i < npages; i++)
> +		dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
> +	kfree(dma_addrs);
> +}
> +
>  void
>  nouveau_dmem_fini(struct nouveau_drm *drm)
>  {
> @@ -380,8 +426,10 @@ nouveau_dmem_fini(struct nouveau_drm *drm)
>  	mutex_lock(&drm->dmem->mutex);
>  
>  	list_for_each_entry_safe(chunk, tmp, &drm->dmem->chunks, list) {
> +		nouveau_dmem_evict_chunk(chunk);
>  		nouveau_bo_unpin(chunk->bo);
>  		nouveau_bo_ref(NULL, &chunk->bo);
> +		WARN_ON(chunk->callocated);
>  		list_del(&chunk->list);
>  		memunmap_pages(&chunk->pagemap);
>  		release_mem_region(chunk->pagemap.range.start,
John Hubbard Sept. 26, 2022, 10:14 p.m. UTC | #3
On 9/26/22 14:35, Lyude Paul wrote:
>> +	for (i = 0; i < npages; i++) {
>> +		if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
>> +			struct page *dpage;
>> +
>> +			/*
>> +			 * _GFP_NOFAIL because the GPU is going away and there
>> +			 * is nothing sensible we can do if we can't copy the
>> +			 * data back.
>> +			 */
> 
> You'll have to excuse me for a moment since this area of nouveau isn't one of
> my strongpoints, but are we sure about this? IIRC __GFP_NOFAIL means infinite
> retry, in the case of a GPU hotplug event I would assume we would rather just
> stop trying to migrate things to the GPU and just drop the data instead of
> hanging on infinite retries.
> 
Hi Lyude!

Actually, I really think it's better in this case to keep trying
(presumably not necessarily infinitely, but only until memory becomes
available), rather than failing out and corrupting data.

That's because I'm not sure it's completely clear that this memory is
discardable. And at some point, we're going to make this all work with
file-backed memory, which will *definitely* not be discardable--I
realize that we're not there yet, of course.

But here, it's reasonable to commit to just retrying indefinitely,
really. Memory should eventually show up. And if it doesn't, then
restarting the machine is better than corrupting data, generally.


thanks,
Felix Kuehling Sept. 26, 2022, 11:07 p.m. UTC | #4
On 2022-09-26 17:35, Lyude Paul wrote:
> On Mon, 2022-09-26 at 16:03 +1000, Alistair Popple wrote:
>> When the module is unloaded or a GPU is unbound from the module it is
>> possible for device private pages to be left mapped in currently running
>> processes. This leads to a kernel crash when the pages are either freed
>> or accessed from the CPU because the GPU and associated data structures
>> and callbacks have all been freed.
>>
>> Fix this by migrating any mappings back to normal CPU memory prior to
>> freeing the GPU memory chunks and associated device private pages.
>>
>> Signed-off-by: Alistair Popple <apopple@nvidia.com>
>>
>> ---
>>
>> I assume the AMD driver might have a similar issue. However I can't see
>> where device private (or coherent) pages actually get unmapped/freed
>> during teardown as I couldn't find any relevant calls to
>> devm_memunmap(), memunmap(), devm_release_mem_region() or
>> release_mem_region(). So it appears that ZONE_DEVICE pages are not being
>> properly freed during module unload, unless I'm missing something?
> I've got no idea, will poke Ben to see if they know the answer to this

I guess we're relying on devm to release the region. Isn't the whole 
point of using devm_request_free_mem_region that we don't have to 
remember to explicitly release it when the device gets destroyed? I 
believe we had an explicit free call at some point by mistake, and that 
caused a double-free during module unload. See this commit for reference:

commit 22f4f4faf337d5fb2d2750aff13215726814273e
Author: Philip Yang <Philip.Yang@amd.com>
Date:   Mon Sep 20 17:25:52 2021 -0400

     drm/amdkfd: fix svm_migrate_fini warning
     
     Device manager releases device-specific resources when a driver
     disconnects from a device, devm_memunmap_pages and
     devm_release_mem_region calls in svm_migrate_fini are redundant.
     
     It causes below warning trace after patch "drm/amdgpu: Split
     amdgpu_device_fini into early and late", so remove function
     svm_migrate_fini.
     
     BUG: https://gitlab.freedesktop.org/drm/amd/-/issues/1718
     
     WARNING: CPU: 1 PID: 3646 at drivers/base/devres.c:795
     devm_release_action+0x51/0x60
     Call Trace:
         ? memunmap_pages+0x360/0x360
         svm_migrate_fini+0x2d/0x60 [amdgpu]
         kgd2kfd_device_exit+0x23/0xa0 [amdgpu]
         amdgpu_amdkfd_device_fini_sw+0x1d/0x30 [amdgpu]
         amdgpu_device_fini_sw+0x45/0x290 [amdgpu]
         amdgpu_driver_release_kms+0x12/0x30 [amdgpu]
         drm_dev_release+0x20/0x40 [drm]
         release_nodes+0x196/0x1e0
         device_release_driver_internal+0x104/0x1d0
         driver_detach+0x47/0x90
         bus_remove_driver+0x7a/0xd0
         pci_unregister_driver+0x3d/0x90
         amdgpu_exit+0x11/0x20 [amdgpu]
     
     Signed-off-by: Philip Yang <Philip.Yang@amd.com>
     Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
     Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Furthermore, I guess we are assuming that nobody is using the GPU when 
the module is unloaded. As long as any processes have /dev/kfd open, you 
won't be able to unload the module (except by force-unload). I suppose 
with ZONE_DEVICE memory, we can have references to device memory pages 
even when user mode has closed /dev/kfd. We do have a cleanup handler 
that runs in an MMU-free-notifier. In theory that should run after all 
the pages in the mm_struct have been freed. It releases all sorts of 
other device resources and needs the driver to still be there. I'm not 
sure if there is anything preventing a module unload before the 
free-notifier runs. I'll look into that.

Regards,
   Felix


>
>> ---
>>   drivers/gpu/drm/nouveau/nouveau_dmem.c | 48 +++++++++++++++++++++++++++-
>>   1 file changed, 48 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
>> index 66ebbd4..3b247b8 100644
>> --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
>> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
>> @@ -369,6 +369,52 @@ nouveau_dmem_suspend(struct nouveau_drm *drm)
>>   	mutex_unlock(&drm->dmem->mutex);
>>   }
>>   
>> +/*
>> + * Evict all pages mapping a chunk.
>> + */
>> +void
>> +nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
>> +{
>> +	unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT;
>> +	unsigned long *src_pfns, *dst_pfns;
>> +	dma_addr_t *dma_addrs;
>> +	struct nouveau_fence *fence;
>> +
>> +	src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
>> +	dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
>> +	dma_addrs = kcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL);
>> +
>> +	migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT,
>> +			npages);
>> +
>> +	for (i = 0; i < npages; i++) {
>> +		if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
>> +			struct page *dpage;
>> +
>> +			/*
>> +			 * _GFP_NOFAIL because the GPU is going away and there
>> +			 * is nothing sensible we can do if we can't copy the
>> +			 * data back.
>> +			 */
> You'll have to excuse me for a moment since this area of nouveau isn't one of
> my strongpoints, but are we sure about this? IIRC __GFP_NOFAIL means infinite
> retry, in the case of a GPU hotplug event I would assume we would rather just
> stop trying to migrate things to the GPU and just drop the data instead of
> hanging on infinite retries.
>
>> +			dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL);
>> +			dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
>> +			nouveau_dmem_copy_one(chunk->drm,
>> +					migrate_pfn_to_page(src_pfns[i]), dpage,
>> +					&dma_addrs[i]);
>> +		}
>> +	}
>> +
>> +	nouveau_fence_new(chunk->drm->dmem->migrate.chan, false, &fence);
>> +	migrate_device_pages(src_pfns, dst_pfns, npages);
>> +	nouveau_dmem_fence_done(&fence);
>> +	migrate_device_finalize(src_pfns, dst_pfns, npages);
>> +	kfree(src_pfns);
>> +	kfree(dst_pfns);
>> +	for (i = 0; i < npages; i++)
>> +		dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
>> +	kfree(dma_addrs);
>> +}
>> +
>>   void
>>   nouveau_dmem_fini(struct nouveau_drm *drm)
>>   {
>> @@ -380,8 +426,10 @@ nouveau_dmem_fini(struct nouveau_drm *drm)
>>   	mutex_lock(&drm->dmem->mutex);
>>   
>>   	list_for_each_entry_safe(chunk, tmp, &drm->dmem->chunks, list) {
>> +		nouveau_dmem_evict_chunk(chunk);
>>   		nouveau_bo_unpin(chunk->bo);
>>   		nouveau_bo_ref(NULL, &chunk->bo);
>> +		WARN_ON(chunk->callocated);
>>   		list_del(&chunk->list);
>>   		memunmap_pages(&chunk->pagemap);
>>   		release_mem_region(chunk->pagemap.range.start,
Alistair Popple Sept. 26, 2022, 11:45 p.m. UTC | #5
John Hubbard <jhubbard@nvidia.com> writes:

> On 9/26/22 14:35, Lyude Paul wrote:
>>> +	for (i = 0; i < npages; i++) {
>>> +		if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
>>> +			struct page *dpage;
>>> +
>>> +			/*
>>> +			 * _GFP_NOFAIL because the GPU is going away and there
>>> +			 * is nothing sensible we can do if we can't copy the
>>> +			 * data back.
>>> +			 */
>>
>> You'll have to excuse me for a moment since this area of nouveau isn't one of
>> my strongpoints, but are we sure about this? IIRC __GFP_NOFAIL means infinite
>> retry, in the case of a GPU hotplug event I would assume we would rather just
>> stop trying to migrate things to the GPU and just drop the data instead of
>> hanging on infinite retries.
>>

No problem, thanks for taking a look!

> Hi Lyude!
>
> Actually, I really think it's better in this case to keep trying
> (presumably not necessarily infinitely, but only until memory becomes
> available), rather than failing out and corrupting data.
>
> That's because I'm not sure it's completely clear that this memory is
> discardable. And at some point, we're going to make this all work with
> file-backed memory, which will *definitely* not be discardable--I
> realize that we're not there yet, of course.
>
> But here, it's reasonable to commit to just retrying indefinitely,
> really. Memory should eventually show up. And if it doesn't, then
> restarting the machine is better than corrupting data, generally.

The memory is definitely not discardable here if the migration failed
because that implies it is still mapped into some userspace process.

We could avoid restarting the machine by doing something similar to what
happens during memory failure and killing every process that maps the
page(s). But overall I think it's better to retry until memory is
available, because that allows things like reclaim to work and in the
worst case allows the OOM killer to select an appropriate task to kill.
It also won't cause data corruption if/when we have file-backed memory.

> thanks,
Alistair Popple Sept. 27, 2022, 1:39 a.m. UTC | #6
Felix Kuehling <felix.kuehling@amd.com> writes:

> On 2022-09-26 17:35, Lyude Paul wrote:
>> On Mon, 2022-09-26 at 16:03 +1000, Alistair Popple wrote:
>>> When the module is unloaded or a GPU is unbound from the module it is
>>> possible for device private pages to be left mapped in currently running
>>> processes. This leads to a kernel crash when the pages are either freed
>>> or accessed from the CPU because the GPU and associated data structures
>>> and callbacks have all been freed.
>>>
>>> Fix this by migrating any mappings back to normal CPU memory prior to
>>> freeing the GPU memory chunks and associated device private pages.
>>>
>>> Signed-off-by: Alistair Popple <apopple@nvidia.com>
>>>
>>> ---
>>>
>>> I assume the AMD driver might have a similar issue. However I can't see
>>> where device private (or coherent) pages actually get unmapped/freed
>>> during teardown as I couldn't find any relevant calls to
>>> devm_memunmap(), memunmap(), devm_release_mem_region() or
>>> release_mem_region(). So it appears that ZONE_DEVICE pages are not being
>>> properly freed during module unload, unless I'm missing something?
>> I've got no idea, will poke Ben to see if they know the answer to this
>
> I guess we're relying on devm to release the region. Isn't the whole point of
> using devm_request_free_mem_region that we don't have to remember to explicitly
> release it when the device gets destroyed? I believe we had an explicit free
> call at some point by mistake, and that caused a double-free during module
> unload. See this commit for reference:

Argh, thanks for that pointer. I was not so familiar with
devm_request_free_mem_region()/devm_memremap_pages() as currently
Nouveau explicitly manages that itself.

> commit 22f4f4faf337d5fb2d2750aff13215726814273e
> Author: Philip Yang <Philip.Yang@amd.com>
> Date:   Mon Sep 20 17:25:52 2021 -0400
>
>     drm/amdkfd: fix svm_migrate_fini warning
>          Device manager releases device-specific resources when a driver
>     disconnects from a device, devm_memunmap_pages and
>     devm_release_mem_region calls in svm_migrate_fini are redundant.
>          It causes below warning trace after patch "drm/amdgpu: Split
>     amdgpu_device_fini into early and late", so remove function
>     svm_migrate_fini.
>          BUG: https://gitlab.freedesktop.org/drm/amd/-/issues/1718
>          WARNING: CPU: 1 PID: 3646 at drivers/base/devres.c:795
>     devm_release_action+0x51/0x60
>     Call Trace:
>         ? memunmap_pages+0x360/0x360
>         svm_migrate_fini+0x2d/0x60 [amdgpu]
>         kgd2kfd_device_exit+0x23/0xa0 [amdgpu]
>         amdgpu_amdkfd_device_fini_sw+0x1d/0x30 [amdgpu]
>         amdgpu_device_fini_sw+0x45/0x290 [amdgpu]
>         amdgpu_driver_release_kms+0x12/0x30 [amdgpu]
>         drm_dev_release+0x20/0x40 [drm]
>         release_nodes+0x196/0x1e0
>         device_release_driver_internal+0x104/0x1d0
>         driver_detach+0x47/0x90
>         bus_remove_driver+0x7a/0xd0
>         pci_unregister_driver+0x3d/0x90
>         amdgpu_exit+0x11/0x20 [amdgpu]
>          Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>     Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>     Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>
> Furthermore, I guess we are assuming that nobody is using the GPU when the
> module is unloaded. As long as any processes have /dev/kfd open, you won't be
> able to unload the module (except by force-unload). I suppose with ZONE_DEVICE
> memory, we can have references to device memory pages even when user mode has
> closed /dev/kfd. We do have a cleanup handler that runs in an MMU-free-notifier.
> In theory that should run after all the pages in the mm_struct have been freed.
> It releases all sorts of other device resources and needs the driver to still be
> there. I'm not sure if there is anything preventing a module unload before the
> free-notifier runs. I'll look into that.

Right - module unload (or device unbind) is one of the other ways we can
hit this issue in Nouveau at least. You can end up with ZONE_DEVICE
pages mapped in a running process after the module has unloaded.
Although now you mention it that seems a bit wrong - the pgmap refcount
should provide some protection against that. Will have to look into
that too.

> Regards,
>   Felix
>
>
>>
>>> ---
>>>   drivers/gpu/drm/nouveau/nouveau_dmem.c | 48 +++++++++++++++++++++++++++-
>>>   1 file changed, 48 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
>>> index 66ebbd4..3b247b8 100644
>>> --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
>>> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
>>> @@ -369,6 +369,52 @@ nouveau_dmem_suspend(struct nouveau_drm *drm)
>>>   	mutex_unlock(&drm->dmem->mutex);
>>>   }
>>>   +/*
>>> + * Evict all pages mapping a chunk.
>>> + */
>>> +void
>>> +nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
>>> +{
>>> +	unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT;
>>> +	unsigned long *src_pfns, *dst_pfns;
>>> +	dma_addr_t *dma_addrs;
>>> +	struct nouveau_fence *fence;
>>> +
>>> +	src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
>>> +	dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
>>> +	dma_addrs = kcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL);
>>> +
>>> +	migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT,
>>> +			npages);
>>> +
>>> +	for (i = 0; i < npages; i++) {
>>> +		if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
>>> +			struct page *dpage;
>>> +
>>> +			/*
>>> +			 * _GFP_NOFAIL because the GPU is going away and there
>>> +			 * is nothing sensible we can do if we can't copy the
>>> +			 * data back.
>>> +			 */
>> You'll have to excuse me for a moment since this area of nouveau isn't one of
>> my strongpoints, but are we sure about this? IIRC __GFP_NOFAIL means infinite
>> retry, in the case of a GPU hotplug event I would assume we would rather just
>> stop trying to migrate things to the GPU and just drop the data instead of
>> hanging on infinite retries.
>>
>>> +			dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL);
>>> +			dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
>>> +			nouveau_dmem_copy_one(chunk->drm,
>>> +					migrate_pfn_to_page(src_pfns[i]), dpage,
>>> +					&dma_addrs[i]);
>>> +		}
>>> +	}
>>> +
>>> +	nouveau_fence_new(chunk->drm->dmem->migrate.chan, false, &fence);
>>> +	migrate_device_pages(src_pfns, dst_pfns, npages);
>>> +	nouveau_dmem_fence_done(&fence);
>>> +	migrate_device_finalize(src_pfns, dst_pfns, npages);
>>> +	kfree(src_pfns);
>>> +	kfree(dst_pfns);
>>> +	for (i = 0; i < npages; i++)
>>> +		dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
>>> +	kfree(dma_addrs);
>>> +}
>>> +
>>>   void
>>>   nouveau_dmem_fini(struct nouveau_drm *drm)
>>>   {
>>> @@ -380,8 +426,10 @@ nouveau_dmem_fini(struct nouveau_drm *drm)
>>>   	mutex_lock(&drm->dmem->mutex);
>>>     	list_for_each_entry_safe(chunk, tmp, &drm->dmem->chunks, list) {
>>> +		nouveau_dmem_evict_chunk(chunk);
>>>   		nouveau_bo_unpin(chunk->bo);
>>>   		nouveau_bo_ref(NULL, &chunk->bo);
>>> +		WARN_ON(chunk->callocated);
>>>   		list_del(&chunk->list);
>>>   		memunmap_pages(&chunk->pagemap);
>>>   		release_mem_region(chunk->pagemap.range.start,
Lyude Paul Sept. 28, 2022, 9:23 p.m. UTC | #7
On Tue, 2022-09-27 at 11:39 +1000, Alistair Popple wrote:
> Felix Kuehling <felix.kuehling@amd.com> writes:
> 
> > On 2022-09-26 17:35, Lyude Paul wrote:
> > > On Mon, 2022-09-26 at 16:03 +1000, Alistair Popple wrote:
> > > > When the module is unloaded or a GPU is unbound from the module it is
> > > > possible for device private pages to be left mapped in currently running
> > > > processes. This leads to a kernel crash when the pages are either freed
> > > > or accessed from the CPU because the GPU and associated data structures
> > > > and callbacks have all been freed.
> > > > 
> > > > Fix this by migrating any mappings back to normal CPU memory prior to
> > > > freeing the GPU memory chunks and associated device private pages.
> > > > 
> > > > Signed-off-by: Alistair Popple <apopple@nvidia.com>
> > > > 
> > > > ---
> > > > 
> > > > I assume the AMD driver might have a similar issue. However I can't see
> > > > where device private (or coherent) pages actually get unmapped/freed
> > > > during teardown as I couldn't find any relevant calls to
> > > > devm_memunmap(), memunmap(), devm_release_mem_region() or
> > > > release_mem_region(). So it appears that ZONE_DEVICE pages are not being
> > > > properly freed during module unload, unless I'm missing something?
> > > I've got no idea, will poke Ben to see if they know the answer to this
> > 
> > I guess we're relying on devm to release the region. Isn't the whole point of
> > using devm_request_free_mem_region that we don't have to remember to explicitly
> > release it when the device gets destroyed? I believe we had an explicit free
> > call at some point by mistake, and that caused a double-free during module
> > unload. See this commit for reference:
> 
> Argh, thanks for that pointer. I was not so familiar with
> devm_request_free_mem_region()/devm_memremap_pages() as currently
> Nouveau explicitly manages that itself.

Mhm, TBH I feel like this was going to happen eventually anyway but there's
another reason for nouveau to start using the managed versions of these
functions at some point

> 
> > commit 22f4f4faf337d5fb2d2750aff13215726814273e
> > Author: Philip Yang <Philip.Yang@amd.com>
> > Date:   Mon Sep 20 17:25:52 2021 -0400
> > 
> >     drm/amdkfd: fix svm_migrate_fini warning
> >          Device manager releases device-specific resources when a driver
> >     disconnects from a device, devm_memunmap_pages and
> >     devm_release_mem_region calls in svm_migrate_fini are redundant.
> >          It causes below warning trace after patch "drm/amdgpu: Split
> >     amdgpu_device_fini into early and late", so remove function
> >     svm_migrate_fini.
> >          BUG: https://gitlab.freedesktop.org/drm/amd/-/issues/1718
> >          WARNING: CPU: 1 PID: 3646 at drivers/base/devres.c:795
> >     devm_release_action+0x51/0x60
> >     Call Trace:
> >         ? memunmap_pages+0x360/0x360
> >         svm_migrate_fini+0x2d/0x60 [amdgpu]
> >         kgd2kfd_device_exit+0x23/0xa0 [amdgpu]
> >         amdgpu_amdkfd_device_fini_sw+0x1d/0x30 [amdgpu]
> >         amdgpu_device_fini_sw+0x45/0x290 [amdgpu]
> >         amdgpu_driver_release_kms+0x12/0x30 [amdgpu]
> >         drm_dev_release+0x20/0x40 [drm]
> >         release_nodes+0x196/0x1e0
> >         device_release_driver_internal+0x104/0x1d0
> >         driver_detach+0x47/0x90
> >         bus_remove_driver+0x7a/0xd0
> >         pci_unregister_driver+0x3d/0x90
> >         amdgpu_exit+0x11/0x20 [amdgpu]
> >          Signed-off-by: Philip Yang <Philip.Yang@amd.com>
> >     Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
> >     Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> > 
> > Furthermore, I guess we are assuming that nobody is using the GPU when the
> > module is unloaded. As long as any processes have /dev/kfd open, you won't be
> > able to unload the module (except by force-unload). I suppose with ZONE_DEVICE
> > memory, we can have references to device memory pages even when user mode has
> > closed /dev/kfd. We do have a cleanup handler that runs in an MMU-free-notifier.
> > In theory that should run after all the pages in the mm_struct have been freed.
> > It releases all sorts of other device resources and needs the driver to still be
> > there. I'm not sure if there is anything preventing a module unload before the
> > free-notifier runs. I'll look into that.
> 
> Right - module unload (or device unbind) is one of the other ways we can
> hit this issue in Nouveau at least. You can end up with ZONE_DEVICE
> pages mapped in a running process after the module has unloaded.
> Although now you mention it that seems a bit wrong - the pgmap refcount
> should provide some protection against that. Will have to look into
> that too.
> 
> > Regards,
> >   Felix
> > 
> > 
> > > 
> > > > ---
> > > >   drivers/gpu/drm/nouveau/nouveau_dmem.c | 48 +++++++++++++++++++++++++++-
> > > >   1 file changed, 48 insertions(+)
> > > > 
> > > > diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> > > > index 66ebbd4..3b247b8 100644
> > > > --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
> > > > +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> > > > @@ -369,6 +369,52 @@ nouveau_dmem_suspend(struct nouveau_drm *drm)
> > > >   	mutex_unlock(&drm->dmem->mutex);
> > > >   }
> > > >   +/*
> > > > + * Evict all pages mapping a chunk.
> > > > + */
> > > > +void
> > > > +nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
> > > > +{
> > > > +	unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT;
> > > > +	unsigned long *src_pfns, *dst_pfns;
> > > > +	dma_addr_t *dma_addrs;
> > > > +	struct nouveau_fence *fence;
> > > > +
> > > > +	src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
> > > > +	dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
> > > > +	dma_addrs = kcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL);
> > > > +
> > > > +	migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT,
> > > > +			npages);
> > > > +
> > > > +	for (i = 0; i < npages; i++) {
> > > > +		if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
> > > > +			struct page *dpage;
> > > > +
> > > > +			/*
> > > > +			 * _GFP_NOFAIL because the GPU is going away and there
> > > > +			 * is nothing sensible we can do if we can't copy the
> > > > +			 * data back.
> > > > +			 */
> > > You'll have to excuse me for a moment since this area of nouveau isn't one of
> > > my strongpoints, but are we sure about this? IIRC __GFP_NOFAIL means infinite
> > > retry, in the case of a GPU hotplug event I would assume we would rather just
> > > stop trying to migrate things to the GPU and just drop the data instead of
> > > hanging on infinite retries.
> > > 
> > > > +			dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL);
> > > > +			dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
> > > > +			nouveau_dmem_copy_one(chunk->drm,
> > > > +					migrate_pfn_to_page(src_pfns[i]), dpage,
> > > > +					&dma_addrs[i]);
> > > > +		}
> > > > +	}
> > > > +
> > > > +	nouveau_fence_new(chunk->drm->dmem->migrate.chan, false, &fence);
> > > > +	migrate_device_pages(src_pfns, dst_pfns, npages);
> > > > +	nouveau_dmem_fence_done(&fence);
> > > > +	migrate_device_finalize(src_pfns, dst_pfns, npages);
> > > > +	kfree(src_pfns);
> > > > +	kfree(dst_pfns);
> > > > +	for (i = 0; i < npages; i++)
> > > > +		dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
> > > > +	kfree(dma_addrs);
> > > > +}
> > > > +
> > > >   void
> > > >   nouveau_dmem_fini(struct nouveau_drm *drm)
> > > >   {
> > > > @@ -380,8 +426,10 @@ nouveau_dmem_fini(struct nouveau_drm *drm)
> > > >   	mutex_lock(&drm->dmem->mutex);
> > > >     	list_for_each_entry_safe(chunk, tmp, &drm->dmem->chunks, list) {
> > > > +		nouveau_dmem_evict_chunk(chunk);
> > > >   		nouveau_bo_unpin(chunk->bo);
> > > >   		nouveau_bo_ref(NULL, &chunk->bo);
> > > > +		WARN_ON(chunk->callocated);
> > > >   		list_del(&chunk->list);
> > > >   		memunmap_pages(&chunk->pagemap);
> > > >   		release_mem_region(chunk->pagemap.range.start,
>
Lyude Paul Sept. 28, 2022, 9:39 p.m. UTC | #8
Re comments about infinite retry: gotcha, makes sense to me.

On Tue, 2022-09-27 at 09:45 +1000, Alistair Popple wrote:
> John Hubbard <jhubbard@nvidia.com> writes:
> 
> > On 9/26/22 14:35, Lyude Paul wrote:
> > > > +	for (i = 0; i < npages; i++) {
> > > > +		if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
> > > > +			struct page *dpage;
> > > > +
> > > > +			/*
> > > > +			 * _GFP_NOFAIL because the GPU is going away and there
> > > > +			 * is nothing sensible we can do if we can't copy the
> > > > +			 * data back.
> > > > +			 */
> > > 
> > > You'll have to excuse me for a moment since this area of nouveau isn't one of
> > > my strongpoints, but are we sure about this? IIRC __GFP_NOFAIL means infinite
> > > retry, in the case of a GPU hotplug event I would assume we would rather just
> > > stop trying to migrate things to the GPU and just drop the data instead of
> > > hanging on infinite retries.
> > > 
> 
> No problem, thanks for taking a look!
> 
> > Hi Lyude!
> > 
> > Actually, I really think it's better in this case to keep trying
> > (presumably not necessarily infinitely, but only until memory becomes
> > available), rather than failing out and corrupting data.
> > 
> > That's because I'm not sure it's completely clear that this memory is
> > discardable. And at some point, we're going to make this all work with
> > file-backed memory, which will *definitely* not be discardable--I
> > realize that we're not there yet, of course.
> > 
> > But here, it's reasonable to commit to just retrying indefinitely,
> > really. Memory should eventually show up. And if it doesn't, then
> > restarting the machine is better than corrupting data, generally.
> 
> The memory is definitely not discardable here if the migration failed
> because that implies it is still mapped into some userspace process.
> 
> We could avoid restarting the machine by doing something similar to what
> happens during memory failure and killing every process that maps the
> page(s). But overall I think it's better to retry until memory is
> available, because that allows things like reclaim to work and in the
> worst case allows the OOM killer to select an appropriate task to kill.
> It also won't cause data corruption if/when we have file-backed memory.
> 
> > thanks,
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 66ebbd4..3b247b8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -369,6 +369,52 @@  nouveau_dmem_suspend(struct nouveau_drm *drm)
 	mutex_unlock(&drm->dmem->mutex);
 }
 
+/*
+ * Evict all pages mapping a chunk.
+ */
+void
+nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
+{
+	unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT;
+	unsigned long *src_pfns, *dst_pfns;
+	dma_addr_t *dma_addrs;
+	struct nouveau_fence *fence;
+
+	src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
+	dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
+	dma_addrs = kcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL);
+
+	migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT,
+			npages);
+
+	for (i = 0; i < npages; i++) {
+		if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
+			struct page *dpage;
+
+			/*
+			 * __GFP_NOFAIL because the GPU is going away and there
+			 * is nothing sensible we can do if we can't copy the
+			 * data back.
+			 */
+			dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL);
+			dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
+			nouveau_dmem_copy_one(chunk->drm,
+					migrate_pfn_to_page(src_pfns[i]), dpage,
+					&dma_addrs[i]);
+		}
+	}
+
+	nouveau_fence_new(chunk->drm->dmem->migrate.chan, false, &fence);
+	migrate_device_pages(src_pfns, dst_pfns, npages);
+	nouveau_dmem_fence_done(&fence);
+	migrate_device_finalize(src_pfns, dst_pfns, npages);
+	kfree(src_pfns);
+	kfree(dst_pfns);
+	for (i = 0; i < npages; i++)
+		dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
+	kfree(dma_addrs);
+}
+
 void
 nouveau_dmem_fini(struct nouveau_drm *drm)
 {
@@ -380,8 +426,10 @@  nouveau_dmem_fini(struct nouveau_drm *drm)
 	mutex_lock(&drm->dmem->mutex);
 
 	list_for_each_entry_safe(chunk, tmp, &drm->dmem->chunks, list) {
+		nouveau_dmem_evict_chunk(chunk);
 		nouveau_bo_unpin(chunk->bo);
 		nouveau_bo_ref(NULL, &chunk->bo);
+		WARN_ON(chunk->callocated);
 		list_del(&chunk->list);
 		memunmap_pages(&chunk->pagemap);
 		release_mem_region(chunk->pagemap.range.start,