@@ -387,6 +387,37 @@ static u64 xe_migrate_res_sizes(struct xe_device *xe, struct xe_res_cursor *cur)
cur->remaining);
}
+/**
+ * pte_update_cmd_size() - calculate the batch buffer command size
+ * needed to update a flat page table.
+ *
+ * @size: The size of the virtual address range covered by the page table update
+ *
+ * The page table to update is assumed to be a flat, one-level page
+ * table with all entries pointing to 4k pages.
+ *
+ * Return: the number of dwords of the update command
+ */
+static u32 pte_update_cmd_size(u64 size)
+{
+ u32 dword;
+ u64 entries = DIV_ROUND_UP(size, XE_PAGE_SIZE);
+
+ XE_WARN_ON(size > MAX_PREEMPTDISABLE_TRANSFER);
+ /*
+ * The MI_STORE_DATA_IMM command is used to update the page table. Each
+ * instruction can update at most 0x1ff PTE entries. To update
+ * n (n <= 0x1ff) PTE entries, we need:
+ * 1 dword for the MI_STORE_DATA_IMM command header (opcode etc.)
+ * 2 dwords for the page table's location
+ * 2*n dwords for the PTE values (each PTE entry is 2 dwords)
+ */
+ dword = (1 + 2) * DIV_ROUND_UP(entries, 0x1ff);
+ dword += entries * 2;
+
+ return dword;
+}
+
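A quick worked example may help sanity-check the accounting above. The following user-space sketch (illustrative only; it assumes a 4 KiB XE_PAGE_SIZE and the 0x1ff PTE-per-command limit from the comment, and is not part of the patch) reproduces the arithmetic for a 2 MiB update:

#include <stdio.h>

/* Illustrative only: mirrors pte_update_cmd_size() for a 2 MiB update. */
int main(void)
{
	unsigned long long size = 2ULL << 20;				/* 2 MiB */
	unsigned long long entries = (size + 4096 - 1) / 4096;		/* 512 PTEs */
	unsigned long long cmds = (entries + 0x1ff - 1) / 0x1ff;	/* 2 MI_STORE_DATA_IMM */
	unsigned long long dwords = cmds * (1 + 2) + entries * 2;	/* 1030 dwords */

	printf("%llu PTEs -> %llu commands -> %llu dwords\n",
	       entries, cmds, dwords);
	return 0;
}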
static u32 pte_update_size(struct xe_migrate *m,
bool is_vram,
struct ttm_resource *res,
@@ -492,6 +523,48 @@ static void emit_pte(struct xe_migrate *m,
}
}
+/**
+ * build_pt_update_batch_sram() - build batch buffer commands to update
+ * the migration vm page table for system memory
+ *
+ * @m: The migration context
+ * @bb: The batch buffer which holds the page table update commands
+ * @pt_offset: The offset of the page table to update, in bytes
+ * @dpa: The device physical address the page table should point to
+ * @size: The size of the virtual address range the page table should cover
+ */
+static void build_pt_update_batch_sram(struct xe_migrate *m,
+ struct xe_bb *bb, u32 pt_offset,
+ u64 dpa, u32 size)
+{
+ u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB];
+ u32 ptes;
+
+ ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
+ while (ptes) {
+ u32 chunk = min(0x1ffU, ptes);
+
+ bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
+ bb->cs[bb->len++] = pt_offset;
+ bb->cs[bb->len++] = 0;
+
+ pt_offset += chunk * 8;
+ ptes -= chunk;
+
+ while (chunk--) {
+ u64 addr;
+
+ addr = dpa & PAGE_MASK;
+ addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
+ addr, pat_index,
+ 0, false, 0);
+ bb->cs[bb->len++] = lower_32_bits(addr);
+ bb->cs[bb->len++] = upper_32_bits(addr);
+ dpa += XE_PAGE_SIZE;
+ }
+ }
+}
+
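For reference, here is a small user-space sketch of how the loop above splits a range into MI_STORE_DATA_IMM chunks (illustrative only, not part of the patch; it assumes 4 KiB pages, the 0x1ff PTE-per-command limit, 8 bytes per PTE, and a made-up 2 MiB range starting at page-table offset 0):

#include <stdio.h>

/* Illustrative only: chunking of a 2 MiB range, as done by the loop above. */
int main(void)
{
	unsigned int ptes = (2u << 20) / 4096;	/* 512 PTEs */
	unsigned int pt_offset = 0;		/* hypothetical slot 0 */

	while (ptes) {
		unsigned int chunk = ptes < 0x1ff ? ptes : 0x1ff;

		/* One MI_STORE_DATA_IMM writes 'chunk' qword PTEs at pt_offset. */
		printf("MI_STORE_DATA_IMM: %u PTEs at pt_offset 0x%x\n",
		       chunk, pt_offset);
		pt_offset += chunk * 8;	/* each PTE is 8 bytes */
		ptes -= chunk;
	}
	return 0;
}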
#define EMIT_COPY_CCS_DW 5
static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
u64 dst_ofs, bool dst_is_indirect,
@@ -808,6 +881,146 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
return fence;
}
+/**
+ * xe_migrate_svm() - A migrate function used by the SVM subsystem
+ *
+ * @m: The migration context
+ * @src_dpa: Device physical start address of the source, from the GPU's point of view
+ * @src_is_vram: True if the source buffer is in vram.
+ * @dst_dpa: Device physical start address of the destination, from the GPU's point of view
+ * @dst_is_vram: True if the destination buffer is in vram.
+ * @size: The size of the data to copy.
+ *
+ * Copy @size bytes of data from @src_dpa to @dst_dpa. The functionality and
+ * behavior of this function are similar to those of xe_migrate_copy(), but
+ * the interface is different. This function is a helper intended to be used
+ * by the SVM subsystem. Since the SVM subsystem has no buffer object and no
+ * TTM resource, there are no src/dst BOs as function input. Instead, the
+ * src/dst device physical addresses are passed in directly.
+ *
+ * Since the backing store of any user malloc'ed or mmap'ed memory can be placed
+ * in system memory, it cannot be compressed. Thus this function doesn't need
+ * to copy CCS (compression control surface) data the way xe_migrate_copy() does.
+ *
+ * This function assumes the source buffer and the destination buffer are both
+ * physically contiguous.
+ *
+ * The GPU blitter is used to copy the data. Source and destination are first
+ * mapped into the migration vm, which is a flat, one-level (L0) page table;
+ * the blitter then performs the copy.
+ *
+ * Return: Pointer to a dma_fence representing the last copy batch, or
+ * an error pointer on failure. If there is a failure, any copy operation
+ * started by the function call has been synced.
+ */
+struct dma_fence *xe_migrate_svm(struct xe_migrate *m,
+ u64 src_dpa,
+ bool src_is_vram,
+ u64 dst_dpa,
+ bool dst_is_vram,
+ u64 size)
+{
+#define NUM_PT_PER_BLIT (MAX_PREEMPTDISABLE_TRANSFER / SZ_2M)
+ struct xe_gt *gt = m->tile->primary_gt;
+ struct xe_device *xe = gt_to_xe(gt);
+ struct dma_fence *fence = NULL;
+ u64 src_L0_ofs, dst_L0_ofs;
+ u64 round_update_size;
+ /* A slot is a 4K page table page; it covers 2M of virtual address space */
+ u32 pt_slot;
+ int err;
+
+ while (size) {
+ u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
+ struct xe_sched_job *job;
+ struct xe_bb *bb;
+ u32 update_idx;
+
+ /*
+ * Copy at most MAX_PREEMPTDISABLE_TRANSFER bytes per batch, which
+ * bounds how long a single blit runs with preemption disabled.
+ */
+ round_update_size = min_t(u64, size, MAX_PREEMPTDISABLE_TRANSFER);
+
+ /* src pte update */
+ if (!src_is_vram)
+ batch_size += pte_update_cmd_size(round_update_size);
+ /* dst pte update */
+ if (!dst_is_vram)
+ batch_size += pte_update_cmd_size(round_update_size);
+
+ /* Copy command size */
+ batch_size += EMIT_COPY_DW;
+
+ bb = xe_bb_new(gt, batch_size, true);
+ if (IS_ERR(bb)) {
+ err = PTR_ERR(bb);
+ goto err_sync;
+ }
+
+ if (!src_is_vram) {
+ pt_slot = 0;
+ build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
+ src_dpa, round_update_size);
+ src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
+ } else {
+ src_L0_ofs = xe_migrate_vram_ofs(xe, src_dpa);
+ }
+
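+ /* dst PTEs go in the page-table slots right after the src slots */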
+ if (!dst_is_vram) {
+ pt_slot = NUM_PT_PER_BLIT;
+ build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
+ dst_dpa, round_update_size);
+ dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
+ } else {
+ dst_L0_ofs = xe_migrate_vram_ofs(xe, dst_dpa);
+ }
+
+ bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
+ update_idx = bb->len;
+
+ emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, round_update_size,
+ XE_PAGE_SIZE);
+
+ mutex_lock(&m->job_mutex);
+ job = xe_bb_create_migration_job(m->q, bb,
+ xe_migrate_batch_base(m, true),
+ update_idx);
+ if (IS_ERR(job)) {
+ err = PTR_ERR(job);
+ goto err;
+ }
+
+ xe_sched_job_add_migrate_flush(job, 0);
+ xe_sched_job_arm(job);
+ dma_fence_put(fence);
+ fence = dma_fence_get(&job->drm.s_fence->finished);
+ xe_sched_job_push(job);
+ dma_fence_put(m->fence);
+ m->fence = dma_fence_get(fence);
+
+ mutex_unlock(&m->job_mutex);
+
+ xe_bb_free(bb, fence);
+ size -= round_update_size;
+ src_dpa += round_update_size;
+ dst_dpa += round_update_size;
+ continue;
+
+err:
+ mutex_unlock(&m->job_mutex);
+ xe_bb_free(bb, NULL);
+
+err_sync:
+ /* Sync partial copy if any. FIXME: under job_mutex? */
+ if (fence) {
+ dma_fence_wait(fence, false);
+ dma_fence_put(fence);
+ }
+
+ return ERR_PTR(err);
+ }
+
+ return fence;
+}
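A minimal caller sketch may clarify the intended use. The helper below is hypothetical (no SVM caller exists in this patch): it assumes the caller already holds a struct xe_migrate pointer and contiguous source/destination device physical addresses, and it simply waits for the returned fence.

/* Hypothetical SVM-side helper, for illustration only. */
static int svm_copy_sysmem_to_vram(struct xe_migrate *m, u64 src_dpa,
				   u64 dst_dpa, u64 size)
{
	struct dma_fence *fence;

	/* Source is system memory, destination is VRAM. */
	fence = xe_migrate_svm(m, src_dpa, false, dst_dpa, true, size);
	if (IS_ERR(fence))
		return PTR_ERR(fence);

	/* Wait for the last copy batch, then drop our fence reference. */
	dma_fence_wait(fence, false);
	dma_fence_put(fence);

	return 0;
}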
static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
u32 size, u32 pitch)
{
@@ -88,6 +88,13 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
struct ttm_resource *dst,
bool copy_only_ccs);
+struct dma_fence *xe_migrate_svm(struct xe_migrate *m,
+ u64 src_dpa,
+ bool src_is_vram,
+ u64 dst_dpa,
+ bool dst_is_vram,
+ u64 size);
+
struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
struct xe_bo *bo,
struct ttm_resource *dst);
Introduce the xe_migrate_svm function for data migration. This function
is similar to xe_migrate_copy but takes different parameters: instead of
BO and TTM resource parameters, it takes the source and destination
buffers' device physical addresses (dpa). This function is intended to
be used by the SVM sub-system, which has no BO and TTM concepts.

Signed-off-by: Oak Zeng <oak.zeng@intel.com>
Cc: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Thomas Hellström <thomas.hellstrom@intel.com>
Cc: Brian Welty <brian.welty@intel.com>
---
 drivers/gpu/drm/xe/xe_migrate.c | 213 ++++++++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_migrate.h |   7 ++
 2 files changed, 220 insertions(+)