[4/9] drm/vc4: Add an API for creating GPU shaders in GEM BOs.

Message ID	1449002158-19156-4-git-send-email-eric@anholt.net (mailing list archive)
State	New, archived
Headers	show Return-Path: <dri-devel-bounces@lists.freedesktop.org> From: Eric Anholt <eric@anholt.net> To: dri-devel@lists.freedesktop.org Subject: [PATCH 4/9] drm/vc4: Add an API for creating GPU shaders in GEM BOs. Date: Tue, 1 Dec 2015 12:35:53 -0800 Message-Id: <1449002158-19156-4-git-send-email-eric@anholt.net> In-Reply-To: <1449002158-19156-1-git-send-email-eric@anholt.net> References: <1449002158-19156-1-git-send-email-eric@anholt.net> MIME-Version: 1.0 Cc: linux-kernel@vger.kernel.org Precedence: list Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: dri-devel-bounces@lists.freedesktop.org Sender: "dri-devel" <dri-devel-bounces@lists.freedesktop.org>

diff --git a/drivers/gpu/drm/vc4/Makefile b/drivers/gpu/drm/vc4/Makefile index 32b4f9c..eb776a6 100644 --- a/drivers/gpu/drm/vc4/Makefile +++ b/drivers/gpu/drm/vc4/Makefile @@ -10,7 +10,8 @@ vc4-y := \ vc4_kms.o \ vc4_hdmi.o \ vc4_hvs.o \ - vc4_plane.o + vc4_plane.o \ + vc4_validate_shaders.o vc4-$(CONFIG_DEBUG_FS) += vc4_debugfs.o diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c index 06cba26..18dfe3e 100644 --- a/drivers/gpu/drm/vc4/vc4_bo.c +++ b/drivers/gpu/drm/vc4/vc4_bo.c @@ -79,6 +79,12 @@ static void vc4_bo_destroy(struct vc4_bo *bo) struct drm_gem_object *obj = &bo->base.base; struct vc4_dev *vc4 = to_vc4_dev(obj->dev); + if (bo->validated_shader) { + kfree(bo->validated_shader->texture_samples); + kfree(bo->validated_shader); + bo->validated_shader = NULL; + } + vc4->bo_stats.num_allocated--; vc4->bo_stats.size_allocated -= obj->size; drm_gem_cma_free_object(obj); @@ -315,6 +321,12 @@ void vc4_free_object(struct drm_gem_object *gem_bo) goto out; } + if (bo->validated_shader) { + kfree(bo->validated_shader->texture_samples); + kfree(bo->validated_shader); + bo->validated_shader = NULL; + } + bo->free_time = jiffies; list_add(&bo->size_head, cache_list); list_add(&bo->unref_head, &vc4->bo_cache.time_list); @@ -347,6 +359,78 @@ static void vc4_bo_cache_time_timer(unsigned long data) schedule_work(&vc4->bo_cache.time_work); } +struct dma_buf * +vc4_prime_export(struct drm_device *dev, struct drm_gem_object *obj, int flags) +{ + struct vc4_bo *bo = to_vc4_bo(obj); + + if (bo->validated_shader) { + DRM_ERROR("Attempting to export shader BO\n"); + return ERR_PTR(-EINVAL); + } + + return drm_gem_prime_export(dev, obj, flags); +} + +int vc4_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct drm_gem_object *gem_obj; + struct vc4_bo *bo; + int ret; + + ret = drm_gem_mmap(filp, vma); + if (ret) + return ret; + + gem_obj = vma->vm_private_data; + bo = to_vc4_bo(gem_obj); + + if (bo->validated_shader && (vma->vm_flags & VM_WRITE)) { + DRM_ERROR("mmaping of shader BOs for writing not allowed.\n"); + return -EINVAL; + } + + /* + * Clear the VM_PFNMAP flag that was set by drm_gem_mmap(), and set the + * vm_pgoff (used as a fake buffer offset by DRM) to 0 as we want to map + * the whole buffer. + */ + vma->vm_flags &= ~VM_PFNMAP; + vma->vm_pgoff = 0; + + ret = dma_mmap_writecombine(bo->base.base.dev->dev, vma, + bo->base.vaddr, bo->base.paddr, + vma->vm_end - vma->vm_start); + if (ret) + drm_gem_vm_close(vma); + + return ret; +} + +int vc4_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) +{ + struct vc4_bo *bo = to_vc4_bo(obj); + + if (bo->validated_shader && (vma->vm_flags & VM_WRITE)) { + DRM_ERROR("mmaping of shader BOs for writing not allowed.\n"); + return -EINVAL; + } + + return drm_gem_cma_prime_mmap(obj, vma); +} + +void *vc4_prime_vmap(struct drm_gem_object *obj) +{ + struct vc4_bo *bo = to_vc4_bo(obj); + + if (bo->validated_shader) { + DRM_ERROR("mmaping of shader BOs not allowed.\n"); + return ERR_PTR(-EINVAL); + } + + return drm_gem_cma_prime_vmap(obj); +} + int vc4_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { @@ -387,6 +471,62 @@ int vc4_mmap_bo_ioctl(struct drm_device *dev, void *data, return 0; } +int +vc4_create_shader_bo_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv) +{ + struct drm_vc4_create_shader_bo *args = data; + struct vc4_bo *bo = NULL; + int ret; + + if (args->size == 0) + return -EINVAL; + + if (args->size % sizeof(u64) != 0) + return -EINVAL; + + if (args->flags != 0) { + DRM_INFO("Unknown flags set: 0x%08x\n", args->flags); + return -EINVAL; + } + + if (args->pad != 0) { + DRM_INFO("Pad set: 0x%08x\n", args->pad); + return -EINVAL; + } + + bo = vc4_bo_create(dev, args->size, true); + if (!bo) + return -ENOMEM; + + ret = copy_from_user(bo->base.vaddr, + (void __user *)(uintptr_t)args->data, + args->size); + if (ret != 0) + goto fail; + /* Clear the rest of the memory from allocating from the BO + * cache. + */ + memset(bo->base.vaddr + args->size, 0, + bo->base.base.size - args->size); + + bo->validated_shader = vc4_validate_shader(&bo->base); + if (!bo->validated_shader) { + ret = -EINVAL; + goto fail; + } + + /* We have to create the handle after validation, to avoid + * races for users to do doing things like mmap the shader BO. + */ + ret = drm_gem_handle_create(file_priv, &bo->base.base, &args->handle); + + fail: + drm_gem_object_unreference_unlocked(&bo->base.base); + + return ret; +} + void vc4_bo_cache_init(struct drm_device *dev) { struct vc4_dev *vc4 = to_vc4_dev(dev); diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c index 5fa4688..da4be9c 100644 --- a/drivers/gpu/drm/vc4/vc4_drv.c +++ b/drivers/gpu/drm/vc4/vc4_drv.c @@ -64,7 +64,7 @@ static const struct file_operations vc4_drm_fops = { .open = drm_open, .release = drm_release, .unlocked_ioctl = drm_ioctl, - .mmap = drm_gem_cma_mmap, + .mmap = vc4_mmap, .poll = drm_poll, .read = drm_read, #ifdef CONFIG_COMPAT @@ -76,6 +76,7 @@ static const struct file_operations vc4_drm_fops = { static const struct drm_ioctl_desc vc4_drm_ioctls[] = { DRM_IOCTL_DEF_DRV(VC4_CREATE_BO, vc4_create_bo_ioctl, 0), DRM_IOCTL_DEF_DRV(VC4_MMAP_BO, vc4_mmap_bo_ioctl, 0), + DRM_IOCTL_DEF_DRV(VC4_CREATE_SHADER_BO, vc4_create_shader_bo_ioctl, 0), }; static struct drm_driver vc4_drm_driver = { @@ -102,12 +103,12 @@ static struct drm_driver vc4_drm_driver = { .prime_handle_to_fd = drm_gem_prime_handle_to_fd, .prime_fd_to_handle = drm_gem_prime_fd_to_handle, .gem_prime_import = drm_gem_prime_import, - .gem_prime_export = drm_gem_prime_export, + .gem_prime_export = vc4_prime_export, .gem_prime_get_sg_table = drm_gem_cma_prime_get_sg_table, .gem_prime_import_sg_table = drm_gem_cma_prime_import_sg_table, - .gem_prime_vmap = drm_gem_cma_prime_vmap, + .gem_prime_vmap = vc4_prime_vmap, .gem_prime_vunmap = drm_gem_cma_prime_vunmap, - .gem_prime_mmap = drm_gem_cma_prime_mmap, + .gem_prime_mmap = vc4_prime_mmap, .dumb_create = vc4_dumb_create, .dumb_map_offset = drm_gem_cma_dumb_map_offset, diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h index fddb0a0..bd77d55 100644 --- a/drivers/gpu/drm/vc4/vc4_drv.h +++ b/drivers/gpu/drm/vc4/vc4_drv.h @@ -69,6 +69,11 @@ struct vc4_bo { /* List entry for the BO's position in vc4_dev->bo_cache.size_list */ struct list_head size_head; + + /* Struct for shader validation state, if created by + * DRM_IOCTL_VC4_CREATE_SHADER_BO. + */ + struct vc4_validated_shader_info *validated_shader; }; static inline struct vc4_bo * @@ -118,6 +123,42 @@ to_vc4_encoder(struct drm_encoder *encoder) #define HVS_WRITE(offset, val) writel(val, vc4->hvs->regs + offset) /** + * struct vc4_texture_sample_info - saves the offsets into the UBO for texture + * setup parameters. + * + * This will be used at draw time to relocate the reference to the texture + * contents in p0, and validate that the offset combined with + * width/height/stride/etc. from p1 and p2/p3 doesn't sample outside the BO. + * Note that the hardware treats unprovided config parameters as 0, so not all + * of them need to be set up for every texure sample, and we'll store ~0 as + * the offset to mark the unused ones. + * + * See the VC4 3D architecture guide page 41 ("Texture and Memory Lookup Unit + * Setup") for definitions of the texture parameters. + */ +struct vc4_texture_sample_info { + bool is_direct; + uint32_t p_offset[4]; +}; + +/** + * struct vc4_validated_shader_info - information about validated shaders that + * needs to be used from command list validation. + * + * For a given shader, each time a shader state record references it, we need + * to verify that the shader doesn't read more uniforms than the shader state + * record's uniform BO pointer can provide, and we need to apply relocations + * and validate the shader state record's uniforms that define the texture + * samples. + */ +struct vc4_validated_shader_info { + uint32_t uniforms_size; + uint32_t uniforms_src_size; + uint32_t num_texture_samples; + struct vc4_texture_sample_info *texture_samples; +}; + +/** * _wait_for - magic (register) wait macro * * Does the right thing for modeset paths when run under kdgb or similar atomic @@ -157,8 +198,13 @@ struct dma_buf *vc4_prime_export(struct drm_device *dev, struct drm_gem_object *obj, int flags); int vc4_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); +int vc4_create_shader_bo_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv); int vc4_mmap_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); +int vc4_mmap(struct file *filp, struct vm_area_struct *vma); +int vc4_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma); +void *vc4_prime_vmap(struct drm_gem_object *obj); void vc4_bo_cache_init(struct drm_device *dev); void vc4_bo_cache_destroy(struct drm_device *dev); int vc4_bo_stats_debugfs(struct seq_file *m, void *arg); @@ -194,3 +240,7 @@ struct drm_plane *vc4_plane_init(struct drm_device *dev, enum drm_plane_type type); u32 vc4_plane_write_dlist(struct drm_plane *plane, u32 __iomem *dlist); u32 vc4_plane_dlist_size(struct drm_plane_state *state); + +/* vc4_validate_shader.c */ +struct vc4_validated_shader_info * +vc4_validate_shader(struct drm_gem_cma_object *shader_obj); diff --git a/drivers/gpu/drm/vc4/vc4_qpu_defines.h b/drivers/gpu/drm/vc4/vc4_qpu_defines.h new file mode 100644 index 0000000..d5c2f3c --- /dev/null +++ b/drivers/gpu/drm/vc4/vc4_qpu_defines.h @@ -0,0 +1,264 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef VC4_QPU_DEFINES_H +#define VC4_QPU_DEFINES_H + +enum qpu_op_add { + QPU_A_NOP, + QPU_A_FADD, + QPU_A_FSUB, + QPU_A_FMIN, + QPU_A_FMAX, + QPU_A_FMINABS, + QPU_A_FMAXABS, + QPU_A_FTOI, + QPU_A_ITOF, + QPU_A_ADD = 12, + QPU_A_SUB, + QPU_A_SHR, + QPU_A_ASR, + QPU_A_ROR, + QPU_A_SHL, + QPU_A_MIN, + QPU_A_MAX, + QPU_A_AND, + QPU_A_OR, + QPU_A_XOR, + QPU_A_NOT, + QPU_A_CLZ, + QPU_A_V8ADDS = 30, + QPU_A_V8SUBS = 31, +}; + +enum qpu_op_mul { + QPU_M_NOP, + QPU_M_FMUL, + QPU_M_MUL24, + QPU_M_V8MULD, + QPU_M_V8MIN, + QPU_M_V8MAX, + QPU_M_V8ADDS, + QPU_M_V8SUBS, +}; + +enum qpu_raddr { + QPU_R_FRAG_PAYLOAD_ZW = 15, /* W for A file, Z for B file */ + /* 0-31 are the plain regfile a or b fields */ + QPU_R_UNIF = 32, + QPU_R_VARY = 35, + QPU_R_ELEM_QPU = 38, + QPU_R_NOP, + QPU_R_XY_PIXEL_COORD = 41, + QPU_R_MS_REV_FLAGS = 41, + QPU_R_VPM = 48, + QPU_R_VPM_LD_BUSY, + QPU_R_VPM_LD_WAIT, + QPU_R_MUTEX_ACQUIRE, +}; + +enum qpu_waddr { + /* 0-31 are the plain regfile a or b fields */ + QPU_W_ACC0 = 32, /* aka r0 */ + QPU_W_ACC1, + QPU_W_ACC2, + QPU_W_ACC3, + QPU_W_TMU_NOSWAP, + QPU_W_ACC5, + QPU_W_HOST_INT, + QPU_W_NOP, + QPU_W_UNIFORMS_ADDRESS, + QPU_W_QUAD_XY, /* X for regfile a, Y for regfile b */ + QPU_W_MS_FLAGS = 42, + QPU_W_REV_FLAG = 42, + QPU_W_TLB_STENCIL_SETUP = 43, + QPU_W_TLB_Z, + QPU_W_TLB_COLOR_MS, + QPU_W_TLB_COLOR_ALL, + QPU_W_TLB_ALPHA_MASK, + QPU_W_VPM, + QPU_W_VPMVCD_SETUP, /* LD for regfile a, ST for regfile b */ + QPU_W_VPM_ADDR, /* LD for regfile a, ST for regfile b */ + QPU_W_MUTEX_RELEASE, + QPU_W_SFU_RECIP, + QPU_W_SFU_RECIPSQRT, + QPU_W_SFU_EXP, + QPU_W_SFU_LOG, + QPU_W_TMU0_S, + QPU_W_TMU0_T, + QPU_W_TMU0_R, + QPU_W_TMU0_B, + QPU_W_TMU1_S, + QPU_W_TMU1_T, + QPU_W_TMU1_R, + QPU_W_TMU1_B, +}; + +enum qpu_sig_bits { + QPU_SIG_SW_BREAKPOINT, + QPU_SIG_NONE, + QPU_SIG_THREAD_SWITCH, + QPU_SIG_PROG_END, + QPU_SIG_WAIT_FOR_SCOREBOARD, + QPU_SIG_SCOREBOARD_UNLOCK, + QPU_SIG_LAST_THREAD_SWITCH, + QPU_SIG_COVERAGE_LOAD, + QPU_SIG_COLOR_LOAD, + QPU_SIG_COLOR_LOAD_END, + QPU_SIG_LOAD_TMU0, + QPU_SIG_LOAD_TMU1, + QPU_SIG_ALPHA_MASK_LOAD, + QPU_SIG_SMALL_IMM, + QPU_SIG_LOAD_IMM, + QPU_SIG_BRANCH +}; + +enum qpu_mux { + /* hardware mux values */ + QPU_MUX_R0, + QPU_MUX_R1, + QPU_MUX_R2, + QPU_MUX_R3, + QPU_MUX_R4, + QPU_MUX_R5, + QPU_MUX_A, + QPU_MUX_B, + + /* non-hardware mux values */ + QPU_MUX_IMM, +}; + +enum qpu_cond { + QPU_COND_NEVER, + QPU_COND_ALWAYS, + QPU_COND_ZS, + QPU_COND_ZC, + QPU_COND_NS, + QPU_COND_NC, + QPU_COND_CS, + QPU_COND_CC, +}; + +enum qpu_pack_mul { + QPU_PACK_MUL_NOP, + /* replicated to each 8 bits of the 32-bit dst. */ + QPU_PACK_MUL_8888 = 3, + QPU_PACK_MUL_8A, + QPU_PACK_MUL_8B, + QPU_PACK_MUL_8C, + QPU_PACK_MUL_8D, +}; + +enum qpu_pack_a { + QPU_PACK_A_NOP, + /* convert to 16 bit float if float input, or to int16. */ + QPU_PACK_A_16A, + QPU_PACK_A_16B, + /* replicated to each 8 bits of the 32-bit dst. */ + QPU_PACK_A_8888, + /* Convert to 8-bit unsigned int. */ + QPU_PACK_A_8A, + QPU_PACK_A_8B, + QPU_PACK_A_8C, + QPU_PACK_A_8D, + + /* Saturating variants of the previous instructions. */ + QPU_PACK_A_32_SAT, /* int-only */ + QPU_PACK_A_16A_SAT, /* int or float */ + QPU_PACK_A_16B_SAT, + QPU_PACK_A_8888_SAT, + QPU_PACK_A_8A_SAT, + QPU_PACK_A_8B_SAT, + QPU_PACK_A_8C_SAT, + QPU_PACK_A_8D_SAT, +}; + +enum qpu_unpack_r4 { + QPU_UNPACK_R4_NOP, + QPU_UNPACK_R4_F16A_TO_F32, + QPU_UNPACK_R4_F16B_TO_F32, + QPU_UNPACK_R4_8D_REP, + QPU_UNPACK_R4_8A, + QPU_UNPACK_R4_8B, + QPU_UNPACK_R4_8C, + QPU_UNPACK_R4_8D, +}; + +#define QPU_MASK(high, low) \ + ((((uint64_t)1 << ((high) - (low) + 1)) - 1) << (low)) + +#define QPU_GET_FIELD(word, field) \ + ((uint32_t)(((word) & field ## _MASK) >> field ## _SHIFT)) + +#define QPU_SIG_SHIFT 60 +#define QPU_SIG_MASK QPU_MASK(63, 60) + +#define QPU_UNPACK_SHIFT 57 +#define QPU_UNPACK_MASK QPU_MASK(59, 57) + +/** + * If set, the pack field means PACK_MUL or R4 packing, instead of normal + * regfile a packing. + */ +#define QPU_PM ((uint64_t)1 << 56) + +#define QPU_PACK_SHIFT 52 +#define QPU_PACK_MASK QPU_MASK(55, 52) + +#define QPU_COND_ADD_SHIFT 49 +#define QPU_COND_ADD_MASK QPU_MASK(51, 49) +#define QPU_COND_MUL_SHIFT 46 +#define QPU_COND_MUL_MASK QPU_MASK(48, 46) + +#define QPU_SF ((uint64_t)1 << 45) + +#define QPU_WADDR_ADD_SHIFT 38 +#define QPU_WADDR_ADD_MASK QPU_MASK(43, 38) +#define QPU_WADDR_MUL_SHIFT 32 +#define QPU_WADDR_MUL_MASK QPU_MASK(37, 32) + +#define QPU_OP_MUL_SHIFT 29 +#define QPU_OP_MUL_MASK QPU_MASK(31, 29) + +#define QPU_RADDR_A_SHIFT 18 +#define QPU_RADDR_A_MASK QPU_MASK(23, 18) +#define QPU_RADDR_B_SHIFT 12 +#define QPU_RADDR_B_MASK QPU_MASK(17, 12) +#define QPU_SMALL_IMM_SHIFT 12 +#define QPU_SMALL_IMM_MASK QPU_MASK(17, 12) + +#define QPU_ADD_A_SHIFT 9 +#define QPU_ADD_A_MASK QPU_MASK(11, 9) +#define QPU_ADD_B_SHIFT 6 +#define QPU_ADD_B_MASK QPU_MASK(8, 6) +#define QPU_MUL_A_SHIFT 3 +#define QPU_MUL_A_MASK QPU_MASK(5, 3) +#define QPU_MUL_B_SHIFT 0 +#define QPU_MUL_B_MASK QPU_MASK(2, 0) + +#define QPU_WS ((uint64_t)1 << 44) + +#define QPU_OP_ADD_SHIFT 24 +#define QPU_OP_ADD_MASK QPU_MASK(28, 24) + +#endif /* VC4_QPU_DEFINES_H */ diff --git a/drivers/gpu/drm/vc4/vc4_validate_shaders.c b/drivers/gpu/drm/vc4/vc4_validate_shaders.c new file mode 100644 index 0000000..f67124b --- /dev/null +++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c @@ -0,0 +1,513 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * DOC: Shader validator for VC4. + * + * The VC4 has no IOMMU between it and system memory, so a user with + * access to execute shaders could escalate privilege by overwriting + * system memory (using the VPM write address register in the + * general-purpose DMA mode) or reading system memory it shouldn't + * (reading it as a texture, or uniform data, or vertex data). + * + * This walks over a shader BO, ensuring that its accesses are + * appropriately bounded, and recording how many texture accesses are + * made and where so that we can do relocations for them in the + * uniform stream. + */ + +#include "vc4_drv.h" +#include "vc4_qpu_defines.h" + +struct vc4_shader_validation_state { + struct vc4_texture_sample_info tmu_setup[2]; + int tmu_write_count[2]; + + /* For registers that were last written to by a MIN instruction with + * one argument being a uniform, the address of the uniform. + * Otherwise, ~0. + * + * This is used for the validation of direct address memory reads. + */ + uint32_t live_min_clamp_offsets[32 + 32 + 4]; + bool live_max_clamp_regs[32 + 32 + 4]; +}; + +static uint32_t +waddr_to_live_reg_index(uint32_t waddr, bool is_b) +{ + if (waddr < 32) { + if (is_b) + return 32 + waddr; + else + return waddr; + } else if (waddr <= QPU_W_ACC3) { + return 64 + waddr - QPU_W_ACC0; + } else { + return ~0; + } +} + +static uint32_t +raddr_add_a_to_live_reg_index(uint64_t inst) +{ + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A); + uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); + uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); + + if (add_a == QPU_MUX_A) + return raddr_a; + else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM) + return 32 + raddr_b; + else if (add_a <= QPU_MUX_R3) + return 64 + add_a; + else + return ~0; +} + +static bool +is_tmu_submit(uint32_t waddr) +{ + return (waddr == QPU_W_TMU0_S || + waddr == QPU_W_TMU1_S); +} + +static bool +is_tmu_write(uint32_t waddr) +{ + return (waddr >= QPU_W_TMU0_S && + waddr <= QPU_W_TMU1_B); +} + +static bool +record_texture_sample(struct vc4_validated_shader_info *validated_shader, + struct vc4_shader_validation_state *validation_state, + int tmu) +{ + uint32_t s = validated_shader->num_texture_samples; + int i; + struct vc4_texture_sample_info *temp_samples; + + temp_samples = krealloc(validated_shader->texture_samples, + (s + 1) * sizeof(*temp_samples), + GFP_KERNEL); + if (!temp_samples) + return false; + + memcpy(&temp_samples[s], + &validation_state->tmu_setup[tmu], + sizeof(*temp_samples)); + + validated_shader->num_texture_samples = s + 1; + validated_shader->texture_samples = temp_samples; + + for (i = 0; i < 4; i++) + validation_state->tmu_setup[tmu].p_offset[i] = ~0; + + return true; +} + +static bool +check_tmu_write(uint64_t inst, + struct vc4_validated_shader_info *validated_shader, + struct vc4_shader_validation_state *validation_state, + bool is_mul) +{ + uint32_t waddr = (is_mul ? + QPU_GET_FIELD(inst, QPU_WADDR_MUL) : + QPU_GET_FIELD(inst, QPU_WADDR_ADD)); + uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); + uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); + int tmu = waddr > QPU_W_TMU0_B; + bool submit = is_tmu_submit(waddr); + bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0; + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + + if (is_direct) { + uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B); + uint32_t clamp_reg, clamp_offset; + + if (sig == QPU_SIG_SMALL_IMM) { + DRM_ERROR("direct TMU read used small immediate\n"); + return false; + } + + /* Make sure that this texture load is an add of the base + * address of the UBO to a clamped offset within the UBO. + */ + if (is_mul || + QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) { + DRM_ERROR("direct TMU load wasn't an add\n"); + return false; + } + + /* We assert that the the clamped address is the first + * argument, and the UBO base address is the second argument. + * This is arbitrary, but simpler than supporting flipping the + * two either way. + */ + clamp_reg = raddr_add_a_to_live_reg_index(inst); + if (clamp_reg == ~0) { + DRM_ERROR("direct TMU load wasn't clamped\n"); + return false; + } + + clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg]; + if (clamp_offset == ~0) { + DRM_ERROR("direct TMU load wasn't clamped\n"); + return false; + } + + /* Store the clamp value's offset in p1 (see reloc_tex() in + * vc4_validate.c). + */ + validation_state->tmu_setup[tmu].p_offset[1] = + clamp_offset; + + if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) && + !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) { + DRM_ERROR("direct TMU load didn't add to a uniform\n"); + return false; + } + + validation_state->tmu_setup[tmu].is_direct = true; + } else { + if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM && + raddr_b == QPU_R_UNIF)) { + DRM_ERROR("uniform read in the same instruction as " + "texture setup.\n"); + return false; + } + } + + if (validation_state->tmu_write_count[tmu] >= 4) { + DRM_ERROR("TMU%d got too many parameters before dispatch\n", + tmu); + return false; + } + validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] = + validated_shader->uniforms_size; + validation_state->tmu_write_count[tmu]++; + /* Since direct uses a RADDR uniform reference, it will get counted in + * check_instruction_reads() + */ + if (!is_direct) + validated_shader->uniforms_size += 4; + + if (submit) { + if (!record_texture_sample(validated_shader, + validation_state, tmu)) { + return false; + } + + validation_state->tmu_write_count[tmu] = 0; + } + + return true; +} + +static bool +check_reg_write(uint64_t inst, + struct vc4_validated_shader_info *validated_shader, + struct vc4_shader_validation_state *validation_state, + bool is_mul) +{ + uint32_t waddr = (is_mul ? + QPU_GET_FIELD(inst, QPU_WADDR_MUL) : + QPU_GET_FIELD(inst, QPU_WADDR_ADD)); + + switch (waddr) { + case QPU_W_UNIFORMS_ADDRESS: + /* XXX: We'll probably need to support this for reladdr, but + * it's definitely a security-related one. + */ + DRM_ERROR("uniforms address load unsupported\n"); + return false; + + case QPU_W_TLB_COLOR_MS: + case QPU_W_TLB_COLOR_ALL: + case QPU_W_TLB_Z: + /* These only interact with the tile buffer, not main memory, + * so they're safe. + */ + return true; + + case QPU_W_TMU0_S: + case QPU_W_TMU0_T: + case QPU_W_TMU0_R: + case QPU_W_TMU0_B: + case QPU_W_TMU1_S: + case QPU_W_TMU1_T: + case QPU_W_TMU1_R: + case QPU_W_TMU1_B: + return check_tmu_write(inst, validated_shader, validation_state, + is_mul); + + case QPU_W_HOST_INT: + case QPU_W_TMU_NOSWAP: + case QPU_W_TLB_ALPHA_MASK: + case QPU_W_MUTEX_RELEASE: + /* XXX: I haven't thought about these, so don't support them + * for now. + */ + DRM_ERROR("Unsupported waddr %d\n", waddr); + return false; + + case QPU_W_VPM_ADDR: + DRM_ERROR("General VPM DMA unsupported\n"); + return false; + + case QPU_W_VPM: + case QPU_W_VPMVCD_SETUP: + /* We allow VPM setup in general, even including VPM DMA + * configuration setup, because the (unsafe) DMA can only be + * triggered by QPU_W_VPM_ADDR writes. + */ + return true; + + case QPU_W_TLB_STENCIL_SETUP: + return true; + } + + return true; +} + +static void +track_live_clamps(uint64_t inst, + struct vc4_validated_shader_info *validated_shader, + struct vc4_shader_validation_state *validation_state) +{ + uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD); + uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); + uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); + uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD); + uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A); + uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B); + uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); + uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + bool ws = inst & QPU_WS; + uint32_t lri_add_a, lri_add, lri_mul; + bool add_a_is_min_0; + + /* Check whether OP_ADD's A argumennt comes from a live MAX(x, 0), + * before we clear previous live state. + */ + lri_add_a = raddr_add_a_to_live_reg_index(inst); + add_a_is_min_0 = (lri_add_a != ~0 && + validation_state->live_max_clamp_regs[lri_add_a]); + + /* Clear live state for registers written by our instruction. */ + lri_add = waddr_to_live_reg_index(waddr_add, ws); + lri_mul = waddr_to_live_reg_index(waddr_mul, !ws); + if (lri_mul != ~0) { + validation_state->live_max_clamp_regs[lri_mul] = false; + validation_state->live_min_clamp_offsets[lri_mul] = ~0; + } + if (lri_add != ~0) { + validation_state->live_max_clamp_regs[lri_add] = false; + validation_state->live_min_clamp_offsets[lri_add] = ~0; + } else { + /* Nothing further to do for live tracking, since only ADDs + * generate new live clamp registers. + */ + return; + } + + /* Now, handle remaining live clamp tracking for the ADD operation. */ + + if (cond_add != QPU_COND_ALWAYS) + return; + + if (op_add == QPU_A_MAX) { + /* Track live clamps of a value to a minimum of 0 (in either + * arg). + */ + if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 || + (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) { + return; + } + + validation_state->live_max_clamp_regs[lri_add] = true; + } else if (op_add == QPU_A_MIN) { + /* Track live clamps of a value clamped to a minimum of 0 and + * a maximum of some uniform's offset. + */ + if (!add_a_is_min_0) + return; + + if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) && + !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF && + sig != QPU_SIG_SMALL_IMM)) { + return; + } + + validation_state->live_min_clamp_offsets[lri_add] = + validated_shader->uniforms_size; + } +} + +static bool +check_instruction_writes(uint64_t inst, + struct vc4_validated_shader_info *validated_shader, + struct vc4_shader_validation_state *validation_state) +{ + uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); + uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); + bool ok; + + if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) { + DRM_ERROR("ADD and MUL both set up textures\n"); + return false; + } + + ok = (check_reg_write(inst, validated_shader, validation_state, + false) && + check_reg_write(inst, validated_shader, validation_state, + true)); + + track_live_clamps(inst, validated_shader, validation_state); + + return ok; +} + +static bool +check_instruction_reads(uint64_t inst, + struct vc4_validated_shader_info *validated_shader) +{ + uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); + uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + + if (raddr_a == QPU_R_UNIF || + (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) { + /* This can't overflow the uint32_t, because we're reading 8 + * bytes of instruction to increment by 4 here, so we'd + * already be OOM. + */ + validated_shader->uniforms_size += 4; + } + + return true; +} + +struct vc4_validated_shader_info * +vc4_validate_shader(struct drm_gem_cma_object *shader_obj) +{ + bool found_shader_end = false; + int shader_end_ip = 0; + uint32_t ip, max_ip; + uint64_t *shader; + struct vc4_validated_shader_info *validated_shader; + struct vc4_shader_validation_state validation_state; + int i; + + memset(&validation_state, 0, sizeof(validation_state)); + + for (i = 0; i < 8; i++) + validation_state.tmu_setup[i / 4].p_offset[i % 4] = ~0; + for (i = 0; i < ARRAY_SIZE(validation_state.live_min_clamp_offsets); i++) + validation_state.live_min_clamp_offsets[i] = ~0; + + shader = shader_obj->vaddr; + max_ip = shader_obj->base.size / sizeof(uint64_t); + + validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL); + if (!validated_shader) + return NULL; + + for (ip = 0; ip < max_ip; ip++) { + uint64_t inst = shader[ip]; + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + + switch (sig) { + case QPU_SIG_NONE: + case QPU_SIG_WAIT_FOR_SCOREBOARD: + case QPU_SIG_SCOREBOARD_UNLOCK: + case QPU_SIG_COLOR_LOAD: + case QPU_SIG_LOAD_TMU0: + case QPU_SIG_LOAD_TMU1: + case QPU_SIG_PROG_END: + case QPU_SIG_SMALL_IMM: + if (!check_instruction_writes(inst, validated_shader, + &validation_state)) { + DRM_ERROR("Bad write at ip %d\n", ip); + goto fail; + } + + if (!check_instruction_reads(inst, validated_shader)) + goto fail; + + if (sig == QPU_SIG_PROG_END) { + found_shader_end = true; + shader_end_ip = ip; + } + + break; + + case QPU_SIG_LOAD_IMM: + if (!check_instruction_writes(inst, validated_shader, + &validation_state)) { + DRM_ERROR("Bad LOAD_IMM write at ip %d\n", ip); + goto fail; + } + break; + + default: + DRM_ERROR("Unsupported QPU signal %d at " + "instruction %d\n", sig, ip); + goto fail; + } + + /* There are two delay slots after program end is signaled + * that are still executed, then we're finished. + */ + if (found_shader_end && ip == shader_end_ip + 2) + break; + } + + if (ip == max_ip) { + DRM_ERROR("shader failed to terminate before " + "shader BO end at %zd\n", + shader_obj->base.size); + goto fail; + } + + /* Again, no chance of integer overflow here because the worst case + * scenario is 8 bytes of uniforms plus handles per 8-byte + * instruction. + */ + validated_shader->uniforms_src_size = + (validated_shader->uniforms_size + + 4 * validated_shader->num_texture_samples); + + return validated_shader; + +fail: + if (validated_shader) { + kfree(validated_shader->texture_samples); + kfree(validated_shader); + } + return NULL; +} diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h index 068aab9..0716b50 100644 --- a/include/uapi/drm/vc4_drm.h +++ b/include/uapi/drm/vc4_drm.h @@ -28,9 +28,11 @@ #define DRM_VC4_CREATE_BO 0x03 #define DRM_VC4_MMAP_BO 0x04 +#define DRM_VC4_CREATE_SHADER_BO 0x05 #define DRM_IOCTL_VC4_CREATE_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo) #define DRM_IOCTL_VC4_MMAP_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo) +#define DRM_IOCTL_VC4_CREATE_SHADER_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo) /** * struct drm_vc4_create_bo - ioctl argument for creating VC4 BOs. @@ -65,4 +67,27 @@ struct drm_vc4_mmap_bo { uint64_t offset; }; +/** + * struct drm_vc4_create_shader_bo - ioctl argument for creating VC4 + * shader BOs. + * + * Since allowing a shader to be overwritten while it's also being + * executed from would allow privlege escalation, shaders must be + * created using this ioctl, and they can't be mmapped later. + */ +struct drm_vc4_create_shader_bo { + /* Size of the data argument. */ + uint32_t size; + /* Flags, currently must be 0. */ + uint32_t flags; + + /* Pointer to the data. */ + uint64_t data; + + /** Returned GEM handle for the BO. */ + uint32_t handle; + /* Pad, must be 0. */ + uint32_t pad; +}; + #endif /* _UAPI_VC4_DRM_H_ */

[4/9] drm/vc4: Add an API for creating GPU shaders in GEM BOs.

Commit Message

Comments

Patch