
sna: Experimental support for write-combining mmaps (wc-mmap)

Message ID CA+icZUXXtpfZfnz+B-PpiFRg+tHozevTYzQ0-moeKq+1+m=ZWg@mail.gmail.com (mailing list archive)
State New, archived

Commit Message

Sedat Dilek Nov. 13, 2014, 3:28 p.m. UTC
Hi,

what is the status of the drm-intel-wc-mmap patchset (#2 + #3)?
I have refreshed them on top of the drm-intel-coherent-phys-gtt patch (#1).
Playing with that against Linux v3.18-rc4.

Regards,
- Sedat -

Comments

Daniel Vetter Nov. 14, 2014, 10:04 a.m. UTC | #1
On Thu, Nov 13, 2014 at 04:28:46PM +0100, Sedat Dilek wrote:
> Hi,
> 
> what is the status of the drm-intel-wc-mmap patchset (#2 + #3)?
> I have refreshed them on top of the drm-intel-coherent-phys-gtt patch (#1).
> Playing with that against Linux v3.18-rc4.

Waiting for the missing testcases and remaining reviews. I tried to
volunteer Akash for that, but apparently failed.
-Daniel

> 
> Regards,
> - Sedat -

> From 20a70ef5865104f35bc8e4cd11ca8ae3b7e6051a Mon Sep 17 00:00:00 2001
> From: Chris Wilson <chris@chris-wilson.co.uk>
> Date: Tue, 4 Nov 2014 04:51:40 -0800
> Subject: [PATCH 1/3] drm/i915: Make the physical object coherent with GTT
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
> 
> Currently, objects for which the hardware needs a contiguous physical
> address are allocated shadow backing storage to satisfy the constraint.
> This shadow buffer is not wired into the normal obj->pages and so the
> physical object is incoherent with accesses via the GPU, GTT and CPU. By
> setting up the appropriate scatter-gather table, we can allow userspace
> to access the physical object either via a GTT mmapping or by rendering
> into the GEM bo. However, keeping the CPU mmap of the shmemfs backing
> storage coherent with the contiguous shadow is not yet possible.
> Fortuitously, CPU mmaps of objects requiring physical addresses are not
> expected to be coherent anyway.
> 
> This allows the physical constraint of the GEM object to be transparent
> to userspace, which can then efficiently render into or update the
> object via the GTT and GPU.
> 
> v2: Fix leak of pci handle spotted by Ville
> v3: Remove the now duplicate call to detach_phys_object during free.
> v4: Wait for rendering before pwrite. As this patch makes it possible to
> render into the phys object, we should make it correct as well!
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
>  drivers/gpu/drm/i915/i915_dma.c |   3 +
>  drivers/gpu/drm/i915/i915_drv.h |   6 +-
>  drivers/gpu/drm/i915/i915_gem.c | 207 +++++++++++++++++++++++++++-------------
>  include/uapi/drm/i915_drm.h     |   1 +
>  4 files changed, 150 insertions(+), 67 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 1403b01..9b08853 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1027,6 +1027,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
>  	case I915_PARAM_CMD_PARSER_VERSION:
>  		value = i915_cmd_parser_get_version();
>  		break;
> +	case I915_PARAM_HAS_COHERENT_PHYS_GTT:
> +		value = 1;
> +		break;
>  	default:
>  		DRM_DEBUG("Unknown parameter %d\n", param->param);
>  		return -EINVAL;
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 16a6f6d..0417784 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1864,10 +1864,10 @@ struct drm_i915_gem_object {
>  	unsigned long user_pin_count;
>  	struct drm_file *pin_filp;
>  
> -	/** for phy allocated objects */
> -	struct drm_dma_handle *phys_handle;
> -
>  	union {
> +		/** for phy allocated objects */
> +		struct drm_dma_handle *phys_handle;
> +
>  		struct i915_gem_userptr {
>  			uintptr_t ptr;
>  			unsigned read_only :1;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 28f91df..124ec85 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -208,40 +208,137 @@ i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
>  	return 0;
>  }
>  
> -static void i915_gem_object_detach_phys(struct drm_i915_gem_object *obj)
> +static int
> +i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
>  {
> -	drm_dma_handle_t *phys = obj->phys_handle;
> +	struct address_space *mapping = file_inode(obj->base.filp)->i_mapping;
> +	char *vaddr = obj->phys_handle->vaddr;
> +	struct sg_table *st;
> +	struct scatterlist *sg;
> +	int i;
>  
> -	if (!phys)
> -		return;
> +	if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
> +		return -EINVAL;
> +
> +	for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
> +		struct page *page;
> +		char *src;
> +
> +		page = shmem_read_mapping_page(mapping, i);
> +		if (IS_ERR(page))
> +			return PTR_ERR(page);
> +
> +		src = kmap_atomic(page);
> +		memcpy(vaddr, src, PAGE_SIZE);
> +		drm_clflush_virt_range(vaddr, PAGE_SIZE);
> +		kunmap_atomic(src);
> +
> +		page_cache_release(page);
> +		vaddr += PAGE_SIZE;
> +	}
> +
> +	i915_gem_chipset_flush(obj->base.dev);
> +
> +	st = kmalloc(sizeof(*st), GFP_KERNEL);
> +	if (st == NULL)
> +		return -ENOMEM;
> +
> +	if (sg_alloc_table(st, 1, GFP_KERNEL)) {
> +		kfree(st);
> +		return -ENOMEM;
> +	}
> +
> +	sg = st->sgl;
> +	sg->offset = 0;
> +	sg->length = obj->base.size;
>  
> -	if (obj->madv == I915_MADV_WILLNEED) {
> +	sg_dma_address(sg) = obj->phys_handle->busaddr;
> +	sg_dma_len(sg) = obj->base.size;
> +
> +	obj->pages = st;
> +	obj->has_dma_mapping = true;
> +	return 0;
> +}
> +
> +static void
> +i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj)
> +{
> +	int ret;
> +
> +	BUG_ON(obj->madv == __I915_MADV_PURGED);
> +
> +	ret = i915_gem_object_set_to_cpu_domain(obj, true);
> +	if (ret) {
> +		/* In the event of a disaster, abandon all caches and
> +		 * hope for the best.
> +		 */
> +		WARN_ON(ret != -EIO);
> +		obj->base.read_domains = obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> +	}
> +
> +	if (obj->madv == I915_MADV_DONTNEED)
> +		obj->dirty = 0;
> +
> +	if (obj->dirty) {
>  		struct address_space *mapping = file_inode(obj->base.filp)->i_mapping;
> -		char *vaddr = phys->vaddr;
> +		char *vaddr = obj->phys_handle->vaddr;
>  		int i;
>  
>  		for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
> -			struct page *page = shmem_read_mapping_page(mapping, i);
> -			if (!IS_ERR(page)) {
> -				char *dst = kmap_atomic(page);
> -				memcpy(dst, vaddr, PAGE_SIZE);
> -				drm_clflush_virt_range(dst, PAGE_SIZE);
> -				kunmap_atomic(dst);
> -
> -				set_page_dirty(page);
> +			struct page *page;
> +			char *dst;
> +
> +			page = shmem_read_mapping_page(mapping, i);
> +			if (IS_ERR(page))
> +				continue;
> +
> +			dst = kmap_atomic(page);
> +			drm_clflush_virt_range(vaddr, PAGE_SIZE);
> +			memcpy(dst, vaddr, PAGE_SIZE);
> +			kunmap_atomic(dst);
> +
> +			set_page_dirty(page);
> +			if (obj->madv == I915_MADV_WILLNEED)
>  				mark_page_accessed(page);
> -				page_cache_release(page);
> -			}
> +			page_cache_release(page);
>  			vaddr += PAGE_SIZE;
>  		}
> -		i915_gem_chipset_flush(obj->base.dev);
> +		obj->dirty = 0;
>  	}
>  
> -#ifdef CONFIG_X86
> -	set_memory_wb((unsigned long)phys->vaddr, phys->size / PAGE_SIZE);
> -#endif
> -	drm_pci_free(obj->base.dev, phys);
> -	obj->phys_handle = NULL;
> +	sg_free_table(obj->pages);
> +	kfree(obj->pages);
> +
> +	obj->has_dma_mapping = false;
> +}
> +
> +static void
> +i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
> +{
> +	drm_pci_free(obj->base.dev, obj->phys_handle);
> +}
> +
> +static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
> +	.get_pages = i915_gem_object_get_pages_phys,
> +	.put_pages = i915_gem_object_put_pages_phys,
> +	.release = i915_gem_object_release_phys,
> +};
> +
> +static int
> +drop_pages(struct drm_i915_gem_object *obj)
> +{
> +	struct i915_vma *vma, *next;
> +	int ret;
> +
> +	drm_gem_object_reference(&obj->base);
> +	list_for_each_entry_safe(vma, next, &obj->vma_list, vma_link)
> +		if (i915_vma_unbind(vma))
> +			break;
> +
> +	ret = i915_gem_object_put_pages(obj);
> +	drm_gem_object_unreference(&obj->base);
> +
> +	return ret;
>  }
>  
>  int
> @@ -249,9 +346,7 @@ i915_gem_object_attach_phys(struct drm_i915_gem_object *obj,
>  			    int align)
>  {
>  	drm_dma_handle_t *phys;
> -	struct address_space *mapping;
> -	char *vaddr;
> -	int i;
> +	int ret;
>  
>  	if (obj->phys_handle) {
>  		if ((unsigned long)obj->phys_handle->vaddr & (align -1))
> @@ -266,41 +361,19 @@ i915_gem_object_attach_phys(struct drm_i915_gem_object *obj,
>  	if (obj->base.filp == NULL)
>  		return -EINVAL;
>  
> +	ret = drop_pages(obj);
> +	if (ret)
> +		return ret;
> +
>  	/* create a new object */
>  	phys = drm_pci_alloc(obj->base.dev, obj->base.size, align);
>  	if (!phys)
>  		return -ENOMEM;
>  
> -	vaddr = phys->vaddr;
> -#ifdef CONFIG_X86
> -	set_memory_wc((unsigned long)vaddr, phys->size / PAGE_SIZE);
> -#endif
> -	mapping = file_inode(obj->base.filp)->i_mapping;
> -	for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
> -		struct page *page;
> -		char *src;
> -
> -		page = shmem_read_mapping_page(mapping, i);
> -		if (IS_ERR(page)) {
> -#ifdef CONFIG_X86
> -			set_memory_wb((unsigned long)phys->vaddr, phys->size / PAGE_SIZE);
> -#endif
> -			drm_pci_free(obj->base.dev, phys);
> -			return PTR_ERR(page);
> -		}
> -
> -		src = kmap_atomic(page);
> -		memcpy(vaddr, src, PAGE_SIZE);
> -		kunmap_atomic(src);
> -
> -		mark_page_accessed(page);
> -		page_cache_release(page);
> -
> -		vaddr += PAGE_SIZE;
> -	}
> -
>  	obj->phys_handle = phys;
> -	return 0;
> +	obj->ops = &i915_gem_phys_ops;
> +
> +	return i915_gem_object_get_pages(obj);
>  }
>  
>  static int
> @@ -311,6 +384,14 @@ i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
>  	struct drm_device *dev = obj->base.dev;
>  	void *vaddr = obj->phys_handle->vaddr + args->offset;
>  	char __user *user_data = to_user_ptr(args->data_ptr);
> +	int ret;
> +
> +	/* We manually control the domain here and pretend that it
> +	 * remains coherent i.e. in the GTT domain, like shmem_pwrite.
> +	 */
> +	ret = i915_gem_object_wait_rendering(obj, false);
> +	if (ret)
> +		return ret;
>  
>  	if (__copy_from_user_inatomic_nocache(vaddr, user_data, args->size)) {
>  		unsigned long unwritten;
> @@ -326,6 +407,7 @@ i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
>  			return -EFAULT;
>  	}
>  
> +	drm_clflush_virt_range(vaddr, args->size);
>  	i915_gem_chipset_flush(dev);
>  	return 0;
>  }
> @@ -1046,11 +1128,6 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
>  	 * pread/pwrite currently are reading and writing from the CPU
>  	 * perspective, requiring manual detiling by the client.
>  	 */
> -	if (obj->phys_handle) {
> -		ret = i915_gem_phys_pwrite(obj, args, file);
> -		goto out;
> -	}
> -
>  	if (obj->tiling_mode == I915_TILING_NONE &&
>  	    obj->base.write_domain != I915_GEM_DOMAIN_CPU &&
>  	    cpu_write_needs_clflush(obj)) {
> @@ -1060,8 +1137,12 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
>  		 * textures). Fallback to the shmem path in that case. */
>  	}
>  
> -	if (ret == -EFAULT || ret == -ENOSPC)
> -		ret = i915_gem_shmem_pwrite(dev, obj, args, file);
> +	if (ret == -EFAULT || ret == -ENOSPC) {
> +		if (obj->phys_handle)
> +			ret = i915_gem_phys_pwrite(obj, args, file);
> +		else
> +			ret = i915_gem_shmem_pwrite(dev, obj, args, file);
> +	}
>  
>  out:
>  	drm_gem_object_unreference(&obj->base);
> @@ -3560,7 +3641,7 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj,
>  	 * Stolen memory is always coherent with the GPU as it is explicitly
>  	 * marked as wc by the system, or the system is cache-coherent.
>  	 */
> -	if (obj->stolen)
> +	if (obj->stolen || obj->phys_handle)
>  		return false;
>  
>  	/* If the GPU is snooping the contents of the CPU cache,
> @@ -4495,8 +4576,6 @@ void i915_gem_free_object(struct drm_gem_object *gem_obj)
>  		}
>  	}
>  
> -	i915_gem_object_detach_phys(obj);
> -
>  	/* Stolen objects don't hold a ref, but do hold pin count. Fix that up
>  	 * before progressing. */
>  	if (obj->stolen)
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index ff57f07..c6b229f 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -340,6 +340,7 @@ typedef struct drm_i915_irq_wait {
>  #define I915_PARAM_HAS_EXEC_HANDLE_LUT   26
>  #define I915_PARAM_HAS_WT     	 	 27
>  #define I915_PARAM_CMD_PARSER_VERSION	 28
> +#define I915_PARAM_HAS_COHERENT_PHYS_GTT 29
>  
>  typedef struct drm_i915_getparam {
>  	int param;
> -- 
> 2.1.3
> 
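
For reference, userspace can probe for this capability with the standard
getparam ioctl. Below is a minimal sketch, not part of the patch itself:
it assumes libdrm (drmIoctl) and an i915_drm.h carrying the new
I915_PARAM_HAS_COHERENT_PHYS_GTT definition.

    #include <xf86drm.h>
    #include <drm/i915_drm.h>

    /* Returns non-zero if the kernel advertises coherent phys-GTT
     * objects, i.e. patch 1/3 is present. */
    static int has_coherent_phys_gtt(int fd)
    {
            drm_i915_getparam_t gp;
            int value = 0;

            gp.param = I915_PARAM_HAS_COHERENT_PHYS_GTT;
            gp.value = &value;
            if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
                    return 0; /* older kernel: parameter unknown */
            return value;
    }

Older kernels reject unknown parameters with -EINVAL, so a failing ioctl
can safely be treated as "capability absent".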

> From ae9a40e0d04464796cc782d4531f386398e5266a Mon Sep 17 00:00:00 2001
> From: Chris Wilson <chris@chris-wilson.co.uk>
> Date: Mon, 13 Oct 2014 14:08:10 +0100
> Subject: [PATCH 2/3] drm/i915: Broaden application of set-domain(GTT)
> 
> Previously, this was restricted to only operate on bound objects - to
> make pointer access through the GTT to the object coherent with writes
> to and from the GPU. A second usecase is drm_intel_bo_wait_rendering()
> which at present does not function unless the object also happens to
> be bound into the GGTT (on current systems that is becoming increasingly
> rare, especially for the typical requests from mesa). A third usecase is
> a future patch wishing to extend the coverage of the GTT domain to
> include objects not bound into the GGTT but still in its coherent cache
> domain. For the latter pair of requests, we need to operate on the
> object regardless of its bind state.
> 
> v2: After discussion with Akash, we came to the conclusion that the
> get-pages was required in order for accurate domain tracking in the
> corner cases (like the shrinker) and also useful for ensuring memory
> coherency with earlier cached CPU mmaps in case userspace uses exotic
> cache bypass (non-temporal) instructions.
> 
> Cc: Akash Goel <akash.goel@intel.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Conflicts:
> 	drivers/gpu/drm/i915/i915_gem.c
> ---
>  drivers/gpu/drm/i915/i915_gem.c | 39 +++++++++++++++++++++------------------
>  1 file changed, 21 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 124ec85..fe119e1 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1490,18 +1490,10 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
>  	if (ret)
>  		goto unref;
>  
> -	if (read_domains & I915_GEM_DOMAIN_GTT) {
> +	if (read_domains & I915_GEM_DOMAIN_GTT)
>  		ret = i915_gem_object_set_to_gtt_domain(obj, write_domain != 0);
> -
> -		/* Silently promote "you're not bound, there was nothing to do"
> -		 * to success, since the client was just asking us to
> -		 * make sure everything was done.
> -		 */
> -		if (ret == -EINVAL)
> -			ret = 0;
> -	} else {
> +	else
>  		ret = i915_gem_object_set_to_cpu_domain(obj, write_domain != 0);
> -	}
>  
>  unref:
>  	drm_gem_object_unreference(&obj->base);
> @@ -3722,15 +3714,10 @@ i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj,
>  int
>  i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
>  {
> -	struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
> -	struct i915_vma *vma = i915_gem_obj_to_ggtt(obj);
>  	uint32_t old_write_domain, old_read_domains;
> +	struct i915_vma *vma;
>  	int ret;
>  
> -	/* Not valid to be called on unbound objects. */
> -	if (vma == NULL)
> -		return -EINVAL;
> -
>  	if (obj->base.write_domain == I915_GEM_DOMAIN_GTT)
>  		return 0;
>  
> @@ -3739,6 +3726,19 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
>  		return ret;
>  
>  	i915_gem_object_retire(obj);
> +
> +	/* Flush and acquire obj->pages so that we are coherent through
> +	 * direct access in memory with previous cached writes through
> +	 * shmemfs and that our cache domain tracking remains valid.
> +	 * For example, if the obj->filp was moved to swap without us
> +	 * being notified and releasing the pages, we would mistakenly
> +	 * continue to assume that the obj remained out of the CPU cached
> +	 * domain.
> +	 */
> +	ret = i915_gem_object_get_pages(obj);
> +	if (ret)
> +		return ret;
> +
>  	i915_gem_object_flush_cpu_write_domain(obj, false);
>  
>  	/* Serialise direct access to this object with the barriers for
> @@ -3770,9 +3770,12 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
>  					    old_write_domain);
>  
>  	/* And bump the LRU for this access */
> -	if (i915_gem_object_is_inactive(obj))
> +	vma = i915_gem_obj_to_ggtt(obj);
> +	if (vma &&
> +	    drm_mm_node_allocated(&vma->node) &&
> +	    i915_gem_object_is_inactive(obj))
>  		list_move_tail(&vma->mm_list,
> -			       &dev_priv->gtt.base.inactive_list);
> +			       &to_i915(obj->base.dev)->gtt.base.inactive_list);
>  
>  	return 0;
>  }
> -- 
> 2.1.3
> 
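
For reference, the drm_intel_bo_wait_rendering() use case from the commit
message boils down to a plain set-domain(GTT) call, which this patch makes
valid even for objects not bound into the GGTT. A minimal sketch, not part
of the patch; 'fd' is assumed to be an open i915 DRM fd and 'handle' a GEM
handle:

    #include <string.h>
    #include <stdint.h>
    #include <xf86drm.h>
    #include <drm/i915_drm.h>

    static int wait_rendering(int fd, uint32_t handle)
    {
            struct drm_i915_gem_set_domain sd;

            memset(&sd, 0, sizeof(sd));
            sd.handle = handle;
            sd.read_domains = I915_GEM_DOMAIN_GTT;
            sd.write_domain = 0; /* read-only: just serialise with the GPU */

            return drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd);
    }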

> From 4882abcb71b4982371a1aad038d7565887138ee5 Mon Sep 17 00:00:00 2001
> From: Akash Goel <akash.goel@intel.com>
> Date: Thu, 23 Oct 2014 17:55:47 +0100
> Subject: [PATCH 3/3] drm/i915: Support creation of unbound wc user mappings
>  for objects
> 
> This patch provides support for creating write-combining virtual mappings
> of a GEM object. It intends to provide the same functionality as the
> 'mmap_gtt' interface without the constraints and contention of a limited
> aperture space, but requires that clients handle the linear-to-tile
> conversion on their own. This improves CPU write performance: with such a
> mapping, writes and reads are almost 50% faster than with mmap_gtt. Like
> the GTT mmapping, and unlike the regular CPU mmapping, it avoids the cache
> flush after an update from the CPU side when the object is passed on to
> the GPU.  This type of mapping is especially useful for sub-region
> updates, i.e. when only a portion of the object is to be updated. Using a
> CPU mmap in such cases would normally incur a clflush of the whole object,
> and using a GTT mmapping would likely require eviction of an active object
> or fence and thus stall. The write-combining CPU mmap avoids both.
> 
> To ensure cache coherency before using this mapping, the GTT domain is
> reused here. This provides the required cache flush if the object is in
> the CPU domain, and synchronization against concurrent rendering.
> Although access through an uncached mmap should automatically invalidate
> the cache lines, this may not be true for non-temporal write
> instructions, and not all pages of the object may be updated at any given
> time through this mapping.  Having a call to get_pages in the
> set_to_gtt_domain function, as added in the earlier patch 'drm/i915:
> Broaden application of set-domain(GTT)', guarantees the clflush, so no
> cachelines will hold stale data for the object before it is accessed
> through this map.
> 
> The drm_i915_gem_mmap structure (for the DRM_I915_GEM_MMAP_IOCTL) has been
> extended with a new flags field (defaulting to 0 for existing users). In
> order for userspace to detect the extended ioctl, a new parameter
> I915_PARAM_MMAP_VERSION has been added for versioning the ioctl interface.
> 
> v2: Fix error handling, invalid flag detection, renaming (ickle)
> v3: Adapt to fit "drm/i915: Make the physical object coherent with GTT" (dileks)
> 
> The new mmapping is exercised by igt/gem_mmap_wc,
> igt/gem_concurrent_blit and igt/gem_gtt_speed.
> 
> Change-Id: Ie883942f9e689525f72fe9a8d3780c3a9faa769a
> Signed-off-by: Akash Goel <akash.goel@intel.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
>  drivers/gpu/drm/i915/i915_dma.c |  3 +++
>  drivers/gpu/drm/i915/i915_gem.c | 19 +++++++++++++++++++
>  include/uapi/drm/i915_drm.h     |  9 +++++++++
>  3 files changed, 31 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 9b08853..c034cfc 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1030,6 +1030,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
>  	case I915_PARAM_HAS_COHERENT_PHYS_GTT:
>  		value = 1;
>  		break;
> +	case I915_PARAM_MMAP_VERSION:
> +		value = 1;
> +		break;
>  	default:
>  		DRM_DEBUG("Unknown parameter %d\n", param->param);
>  		return -EINVAL;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index fe119e1..48d0bbc 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1548,6 +1548,12 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
>  	struct drm_gem_object *obj;
>  	unsigned long addr;
>  
> +	if (args->flags & ~(I915_MMAP_WC))
> +		return -EINVAL;
> +
> +	if (args->flags & I915_MMAP_WC && !cpu_has_pat)
> +		return -ENODEV;
> +
>  	obj = drm_gem_object_lookup(dev, file, args->handle);
>  	if (obj == NULL)
>  		return -ENOENT;
> @@ -1563,6 +1569,19 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
>  	addr = vm_mmap(obj->filp, 0, args->size,
>  		       PROT_READ | PROT_WRITE, MAP_SHARED,
>  		       args->offset);
> +	if (args->flags & I915_MMAP_WC) {
> +		struct mm_struct *mm = current->mm;
> +		struct vm_area_struct *vma;
> +
> +		down_write(&mm->mmap_sem);
> +		vma = find_vma(mm, addr);
> +		if (vma)
> +			vma->vm_page_prot =
> +				pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
> +		else
> +			addr = -ENOMEM;
> +		up_write(&mm->mmap_sem);
> +	}
>  	drm_gem_object_unreference_unlocked(obj);
>  	if (IS_ERR((void *)addr))
>  		return addr;
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index c6b229f..f7eaefa 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -341,6 +341,7 @@ typedef struct drm_i915_irq_wait {
>  #define I915_PARAM_HAS_WT     	 	 27
>  #define I915_PARAM_CMD_PARSER_VERSION	 28
>  #define I915_PARAM_HAS_COHERENT_PHYS_GTT 29
> +#define I915_PARAM_MMAP_VERSION          30
>  
>  typedef struct drm_i915_getparam {
>  	int param;
> @@ -488,6 +489,14 @@ struct drm_i915_gem_mmap {
>  	 * This is a fixed-size type for 32/64 compatibility.
>  	 */
>  	__u64 addr_ptr;
> +
> +	/**
> +	 * Flags for extended behaviour.
> +	 *
> +	 * Added in version 2.
> +	 */
> +	__u64 flags;
> +#define I915_MMAP_WC 0x1
>  };
>  
>  struct drm_i915_gem_mmap_gtt {
> -- 
> 2.1.3
> 
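
For reference, a minimal sketch of how userspace might use the extended
mmap ioctl. This is not part of the patch; it assumes an i915_drm.h with
the new flags field and I915_MMAP_WC bit, and that the kernel reported
I915_PARAM_MMAP_VERSION >= 1:

    #include <string.h>
    #include <stdint.h>
    #include <xf86drm.h>
    #include <drm/i915_drm.h>

    static void *gem_mmap_wc(int fd, uint32_t handle, uint64_t size)
    {
            struct drm_i915_gem_mmap arg;

            memset(&arg, 0, sizeof(arg));
            arg.handle = handle;
            arg.offset = 0;
            arg.size = size;
            arg.flags = I915_MMAP_WC; /* request a WC CPU mapping */

            if (drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP, &arg))
                    return NULL;

            return (void *)(uintptr_t)arg.addr_ptr;
    }

As the commit message explains, the caller should move the object to the
GTT domain (see the set-domain sketch after patch 2/3) before the first
access through this mapping, so that no stale cachelines cover the object.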

> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Sedat Dilek Nov. 14, 2014, 10:16 a.m. UTC | #2
On Fri, Nov 14, 2014 at 11:04 AM, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Thu, Nov 13, 2014 at 04:28:46PM +0100, Sedat Dilek wrote:
>> Hi,
>>
>> what is the status of the drm-intel-wc-mmap patchset (#2 + #3)?
>> I have refreshed them on top of the drm-intel-coherent-phys-gtt patch (#1).
>> Playing with that against Linux v3.18-rc4.
>
> Waiting for the missing testcases and remaining reviews. I tried to
> volunteer Akash for that, but apparently failed.

Hiho Daniel :-),

what can you recommend for testing gfx performance?
Chris made a remark about only 4M ops with x11perf.

What are your experiences with different compilers?
Here I use a prebuilt LLVM/Clang v3.4.2 toolchain from <llvm.org>;
this is my default compiler.
I also have gcc v4.9.2 in a /opt/gcc-4.9 environment.
The default in Ubuntu/precise is gcc v4.6.3.

Note: All my /opt/xorg stuff (currently: libdrm | mesa3d | intelddx)
is compiled with LLVM.

Does intel-gpu-tools include some sort of performance testing?

Any other hints are very much appreciated.

Thanks!

Regards,
- Sedat -

> -Daniel
>
>> [quoted patches trimmed]
Sedat Dilek Feb. 3, 2015, 6:57 p.m. UTC | #3
Ping?

- Sedat -

On Fri, Nov 14, 2014 at 11:04 AM, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Thu, Nov 13, 2014 at 04:28:46PM +0100, Sedat Dilek wrote:
>> [...]
>
> Waiting for the missing testcases and remaining reviews. I tried to
> volunteer Akash for that, but apparently failed.
> -Daniel
>
>> [quoted patches trimmed]
>>       struct drm_device *dev = obj->base.dev;
>>       void *vaddr = obj->phys_handle->vaddr + args->offset;
>>       char __user *user_data = to_user_ptr(args->data_ptr);
>> +     int ret;
>> +
>> +     /* We manually control the domain here and pretend that it
>> +      * remains coherent i.e. in the GTT domain, like shmem_pwrite.
>> +      */
>> +     ret = i915_gem_object_wait_rendering(obj, false);
>> +     if (ret)
>> +             return ret;
>>
>>       if (__copy_from_user_inatomic_nocache(vaddr, user_data, args->size)) {
>>               unsigned long unwritten;
>> @@ -326,6 +407,7 @@ i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
>>                       return -EFAULT;
>>       }
>>
>> +     drm_clflush_virt_range(vaddr, args->size);
>>       i915_gem_chipset_flush(dev);
>>       return 0;
>>  }
>> @@ -1046,11 +1128,6 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
>>        * pread/pwrite currently are reading and writing from the CPU
>>        * perspective, requiring manual detiling by the client.
>>        */
>> -     if (obj->phys_handle) {
>> -             ret = i915_gem_phys_pwrite(obj, args, file);
>> -             goto out;
>> -     }
>> -
>>       if (obj->tiling_mode == I915_TILING_NONE &&
>>           obj->base.write_domain != I915_GEM_DOMAIN_CPU &&
>>           cpu_write_needs_clflush(obj)) {
>> @@ -1060,8 +1137,12 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
>>                * textures). Fallback to the shmem path in that case. */
>>       }
>>
>> -     if (ret == -EFAULT || ret == -ENOSPC)
>> -             ret = i915_gem_shmem_pwrite(dev, obj, args, file);
>> +     if (ret == -EFAULT || ret == -ENOSPC) {
>> +             if (obj->phys_handle)
>> +                     ret = i915_gem_phys_pwrite(obj, args, file);
>> +             else
>> +                     ret = i915_gem_shmem_pwrite(dev, obj, args, file);
>> +     }
>>
>>  out:
>>       drm_gem_object_unreference(&obj->base);
>> @@ -3560,7 +3641,7 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj,
>>        * Stolen memory is always coherent with the GPU as it is explicitly
>>        * marked as wc by the system, or the system is cache-coherent.
>>        */
>> -     if (obj->stolen)
>> +     if (obj->stolen || obj->phys_handle)
>>               return false;
>>
>>       /* If the GPU is snooping the contents of the CPU cache,
>> @@ -4495,8 +4576,6 @@ void i915_gem_free_object(struct drm_gem_object *gem_obj)
>>               }
>>       }
>>
>> -     i915_gem_object_detach_phys(obj);
>> -
>>       /* Stolen objects don't hold a ref, but do hold pin count. Fix that up
>>        * before progressing. */
>>       if (obj->stolen)
>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>> index ff57f07..c6b229f 100644
>> --- a/include/uapi/drm/i915_drm.h
>> +++ b/include/uapi/drm/i915_drm.h
>> @@ -340,6 +340,7 @@ typedef struct drm_i915_irq_wait {
>>  #define I915_PARAM_HAS_EXEC_HANDLE_LUT   26
>>  #define I915_PARAM_HAS_WT                     27
>>  #define I915_PARAM_CMD_PARSER_VERSION         28
>> +#define I915_PARAM_HAS_COHERENT_PHYS_GTT 29
>>
>>  typedef struct drm_i915_getparam {
>>       int param;
>> --
>> 2.1.3
>>
>
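For reference, userspace can probe for the coherent phys-object support advertised by patch 1/3 before relying on GTT access to such objects. A minimal sketch, assuming a libdrm-style drmIoctl wrapper, libdrm's installed i915_drm.h, and an already-open DRM fd (the helper name is illustrative, not part of the patch):

#include <string.h>
#include <xf86drm.h>
#include <i915_drm.h>

/* Query the new parameter; older kernels reply with -EINVAL for
 * unknown parameters, which we treat as "not supported". */
static int has_coherent_phys_gtt(int fd)
{
        drm_i915_getparam_t gp;
        int value = 0;

        memset(&gp, 0, sizeof(gp));
        gp.param = I915_PARAM_HAS_COHERENT_PHYS_GTT;
        gp.value = &value;

        if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) != 0)
                return 0;

        return value == 1;
}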
>> From ae9a40e0d04464796cc782d4531f386398e5266a Mon Sep 17 00:00:00 2001
>> From: Chris Wilson <chris@chris-wilson.co.uk>
>> Date: Mon, 13 Oct 2014 14:08:10 +0100
>> Subject: [PATCH 2/3] drm/i915: Broaden application of set-domain(GTT)
>>
>> Previously, this was restricted to only operate on bound objects - to
>> make pointer access through the GTT to the object coherent with writes
>> to and from the GPU. A second usecase is drm_intel_bo_wait_rendering(),
>> which at present does not function unless the object also happens to
>> be bound into the GGTT (on current systems that is becoming increasingly
>> rare, especially for the typical requests from mesa). A third usecase is
>> a future patch wishing to extend the coverage of the GTT domain to
>> include objects not bound into the GGTT but still in its coherent cache
>> domain. For the latter pair of requests, we need to operate on the
>> object regardless of its bind state.
>>
>> v2: After discussion with Akash, we came to the conclusion that the
>> get-pages call was required for accurate domain tracking in the corner
>> cases (like the shrinker) and is also useful for ensuring memory
>> coherency with earlier cached CPU mmaps in case userspace uses exotic
>> cache-bypass (non-temporal) instructions.
>>
>> Cc: Akash Goel <akash.goel@intel.com>
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>
>> Conflicts:
>>       drivers/gpu/drm/i915/i915_gem.c
>> ---
>>  drivers/gpu/drm/i915/i915_gem.c | 39 +++++++++++++++++++++------------------
>>  1 file changed, 21 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index 124ec85..fe119e1 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -1490,18 +1490,10 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
>>       if (ret)
>>               goto unref;
>>
>> -     if (read_domains & I915_GEM_DOMAIN_GTT) {
>> +     if (read_domains & I915_GEM_DOMAIN_GTT)
>>               ret = i915_gem_object_set_to_gtt_domain(obj, write_domain != 0);
>> -
>> -             /* Silently promote "you're not bound, there was nothing to do"
>> -              * to success, since the client was just asking us to
>> -              * make sure everything was done.
>> -              */
>> -             if (ret == -EINVAL)
>> -                     ret = 0;
>> -     } else {
>> +     else
>>               ret = i915_gem_object_set_to_cpu_domain(obj, write_domain != 0);
>> -     }
>>
>>  unref:
>>       drm_gem_object_unreference(&obj->base);
>> @@ -3722,15 +3714,10 @@ i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj,
>>  int
>>  i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
>>  {
>> -     struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
>> -     struct i915_vma *vma = i915_gem_obj_to_ggtt(obj);
>>       uint32_t old_write_domain, old_read_domains;
>> +     struct i915_vma *vma;
>>       int ret;
>>
>> -     /* Not valid to be called on unbound objects. */
>> -     if (vma == NULL)
>> -             return -EINVAL;
>> -
>>       if (obj->base.write_domain == I915_GEM_DOMAIN_GTT)
>>               return 0;
>>
>> @@ -3739,6 +3726,19 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
>>               return ret;
>>
>>       i915_gem_object_retire(obj);
>> +
>> +     /* Flush and acquire obj->pages so that we are coherent through
>> +      * direct access in memory with previous cached writes through
>> +      * shmemfs and that our cache domain tracking remains valid.
>> +      * For example, if the obj->filp was moved to swap without us
>> +      * being notified and releasing the pages, we would mistakenly
>> +      * continue to assume that the obj remained out of the CPU cached
>> +      * domain.
>> +      */
>> +     ret = i915_gem_object_get_pages(obj);
>> +     if (ret)
>> +             return ret;
>> +
>>       i915_gem_object_flush_cpu_write_domain(obj, false);
>>
>>       /* Serialise direct access to this object with the barriers for
>> @@ -3770,9 +3770,12 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
>>                                           old_write_domain);
>>
>>       /* And bump the LRU for this access */
>> -     if (i915_gem_object_is_inactive(obj))
>> +     vma = i915_gem_obj_to_ggtt(obj);
>> +     if (vma &&
>> +         drm_mm_node_allocated(&vma->node) &&
>> +         i915_gem_object_is_inactive(obj))
>>               list_move_tail(&vma->mm_list,
>> -                            &dev_priv->gtt.base.inactive_list);
>> +                            &to_i915(obj->base.dev)->gtt.base.inactive_list);
>>
>>       return 0;
>>  }
>> --
>> 2.1.3
>>
>
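With the bound-object restriction lifted by patch 2/3, set-domain(GTT) becomes usable as a pure synchronization point, e.g. to implement drm_intel_bo_wait_rendering() without requiring a GGTT binding. A minimal sketch of the ioctl usage, assuming an open DRM fd and a valid GEM handle (the helper name is illustrative):

#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include <i915_drm.h>

/* Pull the object into the GTT domain, waiting for outstanding
 * rendering; with the patch above this works on unbound objects too. */
static int set_domain_gtt(int fd, uint32_t handle, int for_write)
{
        struct drm_i915_gem_set_domain sd;

        memset(&sd, 0, sizeof(sd));
        sd.handle = handle;
        sd.read_domains = I915_GEM_DOMAIN_GTT;
        sd.write_domain = for_write ? I915_GEM_DOMAIN_GTT : 0;

        return drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd);
}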
>> From 4882abcb71b4982371a1aad038d7565887138ee5 Mon Sep 17 00:00:00 2001
>> From: Akash Goel <akash.goel@intel.com>
>> Date: Thu, 23 Oct 2014 17:55:47 +0100
>> Subject: [PATCH 3/3] drm/i915: Support creation of unbound wc user mappings
>>  for objects
>>
>> This patch provides support for creating write-combining virtual mappings
>> of a GEM object. It intends to provide the same functionality as the
>> 'mmap_gtt' interface without the constraints and contention of a limited
>> aperture space, but requires that clients handle the linear-to-tile
>> conversion on their own. This improves CPU write performance: with such a
>> mapping, writes and reads are almost 50% faster than with mmap_gtt. Like
>> the GTT mmapping, and unlike the regular CPU mmapping, it avoids the
>> cache flush after a CPU-side update when the object is passed on to the
>> GPU.  This type of mapping is especially useful for sub-region updates,
>> i.e. when only a portion of the object is to be updated. Using a CPU mmap
>> in such cases would normally incur a clflush of the whole object, and
>> using a GTT mmapping would likely require eviction of an active object or
>> fence and thus stall. The write-combining CPU mmap avoids both.
>>
>> To ensure cache coherency before using this mapping, the GTT domain has
>> been reused here. This provides the required cache flush if the object is
>> in the CPU domain, as well as synchronization against concurrent
>> rendering. Although access through an uncached mmap should automatically
>> invalidate the cache lines, this may not be true for non-temporal write
>> instructions, and not all pages of the object may be updated at any given
>> point in time through this mapping.  Having a call to get_pages in the
>> set_to_gtt_domain function, as added in the earlier patch 'drm/i915:
>> Broaden application of set-domain(GTT)', guarantees the clflush, so no
>> cachelines will be holding stale data for the object before it is
>> accessed through this map.
>>
>> The drm_i915_gem_mmap structure (for the DRM_I915_GEM_MMAP_IOCTL) has been
>> extended with a new flags field (defaulting to 0 for existing users). In
>> order for userspace to detect the extended ioctl, a new parameter
>> I915_PARAM_MMAP_VERSION has been added for versioning the ioctl interface.
>>
>> v2: Fix error handling, invalid flag detection, renaming (ickle)
>> v3: Adapt to fit "drm/i915: Make the physical object coherent with GTT" (dileks)
>>
>> The new mmapping is exercised by igt/gem_mmap_wc,
>> igt/gem_concurrent_blit and igt/gem_gtt_speed.
>>
>> Change-Id: Ie883942f9e689525f72fe9a8d3780c3a9faa769a
>> Signed-off-by: Akash Goel <akash.goel@intel.com>
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>> ---
>>  drivers/gpu/drm/i915/i915_dma.c |  3 +++
>>  drivers/gpu/drm/i915/i915_gem.c | 19 +++++++++++++++++++
>>  include/uapi/drm/i915_drm.h     |  9 +++++++++
>>  3 files changed, 31 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
>> index 9b08853..c034cfc 100644
>> --- a/drivers/gpu/drm/i915/i915_dma.c
>> +++ b/drivers/gpu/drm/i915/i915_dma.c
>> @@ -1030,6 +1030,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
>>       case I915_PARAM_HAS_COHERENT_PHYS_GTT:
>>               value = 1;
>>               break;
>> +     case I915_PARAM_MMAP_VERSION:
>> +             value = 1;
>> +             break;
>>       default:
>>               DRM_DEBUG("Unknown parameter %d\n", param->param);
>>               return -EINVAL;
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index fe119e1..48d0bbc 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -1548,6 +1548,12 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
>>       struct drm_gem_object *obj;
>>       unsigned long addr;
>>
>> +     if (args->flags & ~(I915_MMAP_WC))
>> +             return -EINVAL;
>> +
>> +     if (args->flags & I915_MMAP_WC && !cpu_has_pat)
>> +             return -ENODEV;
>> +
>>       obj = drm_gem_object_lookup(dev, file, args->handle);
>>       if (obj == NULL)
>>               return -ENOENT;
>> @@ -1563,6 +1569,19 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
>>       addr = vm_mmap(obj->filp, 0, args->size,
>>                      PROT_READ | PROT_WRITE, MAP_SHARED,
>>                      args->offset);
>> +     if (args->flags & I915_MMAP_WC) {
>> +             struct mm_struct *mm = current->mm;
>> +             struct vm_area_struct *vma;
>> +
>> +             down_write(&mm->mmap_sem);
>> +             vma = find_vma(mm, addr);
>> +             if (vma)
>> +                     vma->vm_page_prot =
>> +                             pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
>> +             else
>> +                     addr = -ENOMEM;
>> +             up_write(&mm->mmap_sem);
>> +     }
>>       drm_gem_object_unreference_unlocked(obj);
>>       if (IS_ERR((void *)addr))
>>               return addr;
>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>> index c6b229f..f7eaefa 100644
>> --- a/include/uapi/drm/i915_drm.h
>> +++ b/include/uapi/drm/i915_drm.h
>> @@ -341,6 +341,7 @@ typedef struct drm_i915_irq_wait {
>>  #define I915_PARAM_HAS_WT                     27
>>  #define I915_PARAM_CMD_PARSER_VERSION         28
>>  #define I915_PARAM_HAS_COHERENT_PHYS_GTT 29
>> +#define I915_PARAM_MMAP_VERSION          30
>>
>>  typedef struct drm_i915_getparam {
>>       int param;
>> @@ -488,6 +489,14 @@ struct drm_i915_gem_mmap {
>>        * This is a fixed-size type for 32/64 compatibility.
>>        */
>>       __u64 addr_ptr;
>> +
>> +     /**
>> +      * Flags for extended behaviour.
>> +      *
>> +      * Added in version 2.
>> +      */
>> +     __u64 flags;
>> +#define I915_MMAP_WC 0x1
>>  };
>>
>>  struct drm_i915_gem_mmap_gtt {
>> --
>> 2.1.3
>>
>
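Putting the pieces together, userspace would first check I915_PARAM_MMAP_VERSION via getparam and then request a write-combining mapping through the extended mmap ioctl. A minimal sketch, assuming a uapi header carrying the new flags field and I915_MMAP_WC bit from patch 3/3 (the helper name is illustrative):

#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include <i915_drm.h>

/* Map a GEM object write-combined through the extended mmap ioctl;
 * returns NULL on failure (e.g. no PAT support, or an old kernel
 * that rejects the flags field). */
static void *gem_mmap_wc(int fd, uint32_t handle, uint64_t size)
{
        struct drm_i915_gem_mmap arg;

        memset(&arg, 0, sizeof(arg));
        arg.handle = handle;
        arg.offset = 0;
        arg.size = size;
        arg.flags = I915_MMAP_WC;

        if (drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP, &arg) != 0)
                return NULL;

        /* The kernel writes the CPU address of the new mapping back. */
        return (void *)(uintptr_t)arg.addr_ptr;
}

Before the first write through such a mapping, a set-domain(GTT) call, as in the sketch after patch 2/3, provides the flush and serialisation the commit message requires.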
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
Chris Wilson Feb. 3, 2015, 7:24 p.m. UTC | #4
On Tue, Feb 03, 2015 at 07:57:34PM +0100, Sedat Dilek wrote:
> Ping?

It's upstream and enabled by default for kernel v3.20+.
-Chris
Sedat Dilek Feb. 3, 2015, 7:44 p.m. UTC | #5
On Tue, Feb 3, 2015 at 8:24 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> On Tue, Feb 03, 2015 at 07:57:34PM +0100, Sedat Dilek wrote:
>> Ping?
>
> It's upstream and enabled by default for kernel v3.20+.
>

Thanks, I have looked into Linux-next (next-20150203) and see both patches:

commit 43566dedde54f9729113f5f9fde77d53e75e61e9
"drm/i915: Broaden application of set-domain(GTT)"

commit 1816f92363036600f2387bb8273b1e5e1f5b304e
"drm/i915: Support creation of unbound wc user mappings for objects"

- Sedat -

[1] http://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git/commit/?id=43566dedde54f9729113f5f9fde77d53e75e61e9
[2] http://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git/commit/?id=1816f92363036600f2387bb8273b1e5e1f5b304e