diff mbox series

回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

Message ID DM4PR12MB5165A2CF1E994C2D4FA35B0D872B9@DM4PR12MB5165.namprd12.prod.outlook.com (mailing list archive)
State New, archived
Headers show
Series 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin | expand

Commit Message

Pan, Xinhui May 19, 2021, 2:39 a.m. UTC
[AMD Official Use Only]

To observe the issue. I made one kfdtest case for debug.
It just alloc a userptr memory and detect if memory is corrupted.
I can hit this failure in 2 minutes. :(

--
2.25.1
diff mbox series

Patch

diff --git a/tests/kfdtest/src/KFDMemoryTest.cpp b/tests/kfdtest/src/KFDMemoryTest.cpp
index 70c8033..a72f53f 100644
--- a/tests/kfdtest/src/KFDMemoryTest.cpp
+++ b/tests/kfdtest/src/KFDMemoryTest.cpp
@@ -584,6 +584,32 @@  TEST_F(KFDMemoryTest, ZeroMemorySizeAlloc) {
     TEST_END
 }

+TEST_F(KFDMemoryTest, swap) {
+    TEST_START(TESTPROFILE_RUNALL)
+
+    unsigned int size = 128<<20;
+    unsigned int*tmp = (unsigned int *)mmap(0,
+                   size,
+                   PROT_READ | PROT_WRITE,
+                   MAP_ANONYMOUS | MAP_PRIVATE,
+                   -1,
+                   0);
+    EXPECT_NE(tmp, MAP_FAILED);
+
+    LOG() << "pls run this with KFDMemoryTest.LargestSysBufferTest" << std::endl;
+    do {
+           memset(tmp, 0xcc, size);
+
+           HsaMemoryBuffer buf(tmp, size);
+           sleep(1);
+           EXPECT_EQ(tmp[0], 0xcccccccc);
+    } while (true);
+
+    munmap(tmp, size);
+
+    TEST_END
+}
+
 // Basic test for hsaKmtAllocMemory
 TEST_F(KFDMemoryTest, MemoryAlloc) {
     TEST_START(TESTPROFILE_RUNALL)
--
2.25.1

________________________________________
发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
发送时间: 2021年5月19日 10:28
收件人: amd-gfx@lists.freedesktop.org
抄送: Kuehling, Felix; Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch; Pan, Xinhui
主题: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

cpu 1                                           cpu 2
kfd alloc BO A(userptr)                         alloc BO B(GTT)
    ->init -> validate                          -> init -> validate -> populate
    init_user_pages                               -> swapout BO A //hit ttm pages limit
        -> get_user_pages (fill up ttm->pages)
         -> validate -> populate
          -> swapin BO A // Now hit the BUG

We know that get_user_pages may race with swapout on same BO.
Threre are some issues I have met.
1) memory corruption.
This is because we do a swap before memory is setup. ttm_tt_swapout()
just create a swap_storage with its content being 0x0. So when we setup
memory after the swapout. The following swapin makes the memory
corrupted.

2) panic
When swapout happes with get_user_pages, they touch ttm->pages without
anylock. It causes memory corruption too. But I hit page fault mostly.

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 928e8d57cd08..42460e4480f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -835,6 +835,7 @@  static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
        struct amdkfd_process_info *process_info = mem->process_info;
        struct amdgpu_bo *bo = mem->bo;
        struct ttm_operation_ctx ctx = { true, false };
+       struct page **pages;
        int ret = 0;

        mutex_lock(&process_info->lock);
@@ -852,7 +853,13 @@  static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
                goto out;
        }

-       ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
+       pages = kvmalloc_array(bo->tbo.ttm->num_pages,
+                       sizeof(struct page *),
+                       GFP_KERNEL | __GFP_ZERO);
+       if (!pages)
+               goto unregister_out;
+
+       ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
        if (ret) {
                pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
                goto unregister_out;
@@ -863,6 +870,12 @@  static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
                pr_err("%s: Failed to reserve BO\n", __func__);
                goto release_out;
        }
+
+       WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
+
+       memcpy(bo->tbo.ttm->pages,
+                       pages,
+                       sizeof(struct page*) * bo->tbo.ttm->num_pages);
        amdgpu_bo_placement_from_domain(bo, mem->domain);
        ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
        if (ret)
@@ -872,6 +885,7 @@  static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
 release_out:
        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
 unregister_out:
+       kvfree(pages);
        if (ret)
                amdgpu_mn_unregister(bo);
 out: