diff mbox series

vulkan: Add VK_EXT_calibrated_timestamps extension (radv and anv) [v4]

Message ID 20181016053150.11453-1-keithp@keithp.com
State New, archived
Headers show
Series vulkan: Add VK_EXT_calibrated_timestamps extension (radv and anv) [v4] | expand

Commit Message

Keith Packard Oct. 16, 2018, 5:31 a.m. UTC
Offers three clocks, device, clock monotonic and clock monotonic
raw. Could use some kernel support to reduce the deviation between
clock values.

v2:
	Ensure deviation is at least as big as the GPU time interval.

v3:
	Set device->lost when returning DEVICE_LOST.
	Use MAX2 and DIV_ROUND_UP instead of open coding these.
	Delete spurious TIMESTAMP in radv version.
	Suggested-by: Jason Ekstrand <jason@jlekstrand.net>
	Suggested-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>

v4:
	Add anv_gem_reg_read to anv_gem_stubs.c
	Suggested-by: Jason Ekstrand <jason@jlekstrand.net>

Signed-off-by: Keith Packard <keithp@keithp.com>
---
 src/amd/vulkan/radv_device.c       | 81 +++++++++++++++++++++++++++
 src/amd/vulkan/radv_extensions.py  |  1 +
 src/intel/vulkan/anv_device.c      | 89 ++++++++++++++++++++++++++++++
 src/intel/vulkan/anv_extensions.py |  1 +
 src/intel/vulkan/anv_gem.c         | 13 +++++
 src/intel/vulkan/anv_gem_stubs.c   |  7 +++
 src/intel/vulkan/anv_private.h     |  2 +
 7 files changed, 194 insertions(+)

Comments

Samuel Pitoiset Oct. 16, 2018, 7:32 a.m. UTC | #1
The RADV bits are:

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>

Thanks!

On 10/16/18 7:31 AM, Keith Packard wrote:
> Offers three clocks, device, clock monotonic and clock monotonic
> raw. Could use some kernel support to reduce the deviation between
> clock values.
> 
> v2:
> 	Ensure deviation is at least as big as the GPU time interval.
> 
> v3:
> 	Set device->lost when returning DEVICE_LOST.
> 	Use MAX2 and DIV_ROUND_UP instead of open coding these.
> 	Delete spurious TIMESTAMP in radv version.
> 	Suggested-by: Jason Ekstrand <jason@jlekstrand.net>
> 	Suggested-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> 
> v4:
> 	Add anv_gem_reg_read to anv_gem_stubs.c
> 	Suggested-by: Jason Ekstrand <jason@jlekstrand.net>
> 
> Signed-off-by: Keith Packard <keithp@keithp.com>
> ---
>   src/amd/vulkan/radv_device.c       | 81 +++++++++++++++++++++++++++
>   src/amd/vulkan/radv_extensions.py  |  1 +
>   src/intel/vulkan/anv_device.c      | 89 ++++++++++++++++++++++++++++++
>   src/intel/vulkan/anv_extensions.py |  1 +
>   src/intel/vulkan/anv_gem.c         | 13 +++++
>   src/intel/vulkan/anv_gem_stubs.c   |  7 +++
>   src/intel/vulkan/anv_private.h     |  2 +
>   7 files changed, 194 insertions(+)
> 
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index 174922780fc..80050485e54 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -4955,3 +4955,84 @@ radv_GetDeviceGroupPeerMemoryFeatures(
>   	                       VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT |
>   	                       VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT;
>   }
> +
> +static const VkTimeDomainEXT radv_time_domains[] = {
> +	VK_TIME_DOMAIN_DEVICE_EXT,
> +	VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT,
> +	VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT,
> +};
> +
> +VkResult radv_GetPhysicalDeviceCalibrateableTimeDomainsEXT(
> +	VkPhysicalDevice                             physicalDevice,
> +	uint32_t                                     *pTimeDomainCount,
> +	VkTimeDomainEXT                              *pTimeDomains)
> +{
> +	int d;
> +	VK_OUTARRAY_MAKE(out, pTimeDomains, pTimeDomainCount);
> +
> +	for (d = 0; d < ARRAY_SIZE(radv_time_domains); d++) {
> +		vk_outarray_append(&out, i) {
> +			*i = radv_time_domains[d];
> +		}
> +	}
> +
> +	return vk_outarray_status(&out);
> +}
> +
> +static uint64_t
> +radv_clock_gettime(clockid_t clock_id)
> +{
> +	struct timespec current;
> +	int ret;
> +
> +	ret = clock_gettime(clock_id, &current);
> +	if (ret < 0 && clock_id == CLOCK_MONOTONIC_RAW)
> +		ret = clock_gettime(CLOCK_MONOTONIC, &current);
> +	if (ret < 0)
> +		return 0;
> +
> +	return (uint64_t) current.tv_sec * 1000000000ULL + current.tv_nsec;
> +}
> +
> +VkResult radv_GetCalibratedTimestampsEXT(
> +	VkDevice                                     _device,
> +	uint32_t                                     timestampCount,
> +	const VkCalibratedTimestampInfoEXT           *pTimestampInfos,
> +	uint64_t                                     *pTimestamps,
> +	uint64_t                                     *pMaxDeviation)
> +{
> +	RADV_FROM_HANDLE(radv_device, device, _device);
> +	uint32_t clock_crystal_freq = device->physical_device->rad_info.clock_crystal_freq;
> +	int d;
> +	uint64_t begin, end;
> +
> +	begin = radv_clock_gettime(CLOCK_MONOTONIC_RAW);
> +
> +	for (d = 0; d < timestampCount; d++) {
> +		switch (pTimestampInfos[d].timeDomain) {
> +		case VK_TIME_DOMAIN_DEVICE_EXT:
> +			pTimestamps[d] = device->ws->query_value(device->ws,
> +								 RADEON_TIMESTAMP);
> +			break;
> +		case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT:
> +			pTimestamps[d] = radv_clock_gettime(CLOCK_MONOTONIC);
> +			break;
> +
> +		case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT:
> +			pTimestamps[d] = begin;
> +			break;
> +		default:
> +			pTimestamps[d] = 0;
> +			break;
> +		}
> +	}
> +
> +	end = radv_clock_gettime(CLOCK_MONOTONIC_RAW);
> +
> +	uint64_t clock_period = end - begin;
> +	uint64_t device_period = DIV_ROUND_UP(1000000, clock_crystal_freq);
> +
> +	*pMaxDeviation = MAX2(clock_period, device_period);
> +
> +	return VK_SUCCESS;
> +}
> diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py
> index 5dcedae1c63..4c81d3f0068 100644
> --- a/src/amd/vulkan/radv_extensions.py
> +++ b/src/amd/vulkan/radv_extensions.py
> @@ -92,6 +92,7 @@ EXTENSIONS = [
>       Extension('VK_KHR_display',                          23, 'VK_USE_PLATFORM_DISPLAY_KHR'),
>       Extension('VK_EXT_direct_mode_display',               1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
>       Extension('VK_EXT_acquire_xlib_display',              1, 'VK_USE_PLATFORM_XLIB_XRANDR_EXT'),
> +    Extension('VK_EXT_calibrated_timestamps',             1, True),
>       Extension('VK_EXT_conditional_rendering',             1, True),
>       Extension('VK_EXT_conservative_rasterization',        1, 'device->rad_info.chip_class >= GFX9'),
>       Extension('VK_EXT_display_surface_counter',           1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
> diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
> index a2551452eb1..249d7f36f0f 100644
> --- a/src/intel/vulkan/anv_device.c
> +++ b/src/intel/vulkan/anv_device.c
> @@ -3021,6 +3021,95 @@ void anv_DestroyFramebuffer(
>      vk_free2(&device->alloc, pAllocator, fb);
>   }
>   
> +static const VkTimeDomainEXT anv_time_domains[] = {
> +   VK_TIME_DOMAIN_DEVICE_EXT,
> +   VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT,
> +   VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT,
> +};
> +
> +VkResult anv_GetPhysicalDeviceCalibrateableTimeDomainsEXT(
> +   VkPhysicalDevice                             physicalDevice,
> +   uint32_t                                     *pTimeDomainCount,
> +   VkTimeDomainEXT                              *pTimeDomains)
> +{
> +   int d;
> +   VK_OUTARRAY_MAKE(out, pTimeDomains, pTimeDomainCount);
> +
> +   for (d = 0; d < ARRAY_SIZE(anv_time_domains); d++) {
> +      vk_outarray_append(&out, i) {
> +         *i = anv_time_domains[d];
> +      }
> +   }
> +
> +   return vk_outarray_status(&out);
> +}
> +
> +static uint64_t
> +anv_clock_gettime(clockid_t clock_id)
> +{
> +   struct timespec current;
> +   int ret;
> +
> +   ret = clock_gettime(clock_id, &current);
> +   if (ret < 0 && clock_id == CLOCK_MONOTONIC_RAW)
> +      ret = clock_gettime(CLOCK_MONOTONIC, &current);
> +   if (ret < 0)
> +      return 0;
> +
> +   return (uint64_t) current.tv_sec * 1000000000ULL + current.tv_nsec;
> +}
> +
> +#define TIMESTAMP 0x2358
> +
> +VkResult anv_GetCalibratedTimestampsEXT(
> +   VkDevice                                     _device,
> +   uint32_t                                     timestampCount,
> +   const VkCalibratedTimestampInfoEXT           *pTimestampInfos,
> +   uint64_t                                     *pTimestamps,
> +   uint64_t                                     *pMaxDeviation)
> +{
> +   ANV_FROM_HANDLE(anv_device, device, _device);
> +   uint64_t timestamp_frequency = device->info.timestamp_frequency;
> +   int  ret;
> +   int d;
> +   uint64_t begin, end;
> +
> +   begin = anv_clock_gettime(CLOCK_MONOTONIC_RAW);
> +
> +   for (d = 0; d < timestampCount; d++) {
> +      switch (pTimestampInfos[d].timeDomain) {
> +      case VK_TIME_DOMAIN_DEVICE_EXT:
> +         ret = anv_gem_reg_read(device, TIMESTAMP | 1,
> +                                &pTimestamps[d]);
> +
> +         if (ret != 0) {
> +            device->lost = TRUE;
> +            return VK_ERROR_DEVICE_LOST;
> +         }
> +         break;
> +      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT:
> +         pTimestamps[d] = anv_clock_gettime(CLOCK_MONOTONIC);
> +         break;
> +
> +      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT:
> +         pTimestamps[d] = begin;
> +         break;
> +      default:
> +         pTimestamps[d] = 0;
> +         break;
> +      }
> +   }
> +
> +   end = anv_clock_gettime(CLOCK_MONOTONIC_RAW);
> +
> +   uint64_t clock_period = end - begin;
> +   uint64_t device_period = DIV_ROUND_UP(1000000000, timestamp_frequency);
> +
> +   *pMaxDeviation = MAX2(clock_period, device_period);
> +
> +   return VK_SUCCESS;
> +}
> +
>   /* vk_icd.h does not declare this function, so we declare it here to
>    * suppress Wmissing-prototypes.
>    */
> diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py
> index d4915c95013..a8535964da7 100644
> --- a/src/intel/vulkan/anv_extensions.py
> +++ b/src/intel/vulkan/anv_extensions.py
> @@ -126,6 +126,7 @@ EXTENSIONS = [
>       Extension('VK_EXT_vertex_attribute_divisor',          3, True),
>       Extension('VK_EXT_post_depth_coverage',               1, 'device->info.gen >= 9'),
>       Extension('VK_EXT_sampler_filter_minmax',             1, 'device->info.gen >= 9'),
> +    Extension('VK_EXT_calibrated_timestamps',             1, True),
>   ]
>   
>   class VkVersion:
> diff --git a/src/intel/vulkan/anv_gem.c b/src/intel/vulkan/anv_gem.c
> index c43b5ef9e06..1bdf040c1a3 100644
> --- a/src/intel/vulkan/anv_gem.c
> +++ b/src/intel/vulkan/anv_gem.c
> @@ -423,6 +423,19 @@ anv_gem_fd_to_handle(struct anv_device *device, int fd)
>      return args.handle;
>   }
>   
> +int
> +anv_gem_reg_read(struct anv_device *device, uint32_t offset, uint64_t *result)
> +{
> +   struct drm_i915_reg_read args = {
> +      .offset = offset
> +   };
> +
> +   int ret = anv_ioctl(device->fd, DRM_IOCTL_I915_REG_READ, &args);
> +
> +   *result = args.val;
> +   return ret;
> +}
> +
>   #ifndef SYNC_IOC_MAGIC
>   /* duplicated from linux/sync_file.h to avoid build-time dependency
>    * on new (v4.7) kernel headers.  Once distro's are mostly using
> diff --git a/src/intel/vulkan/anv_gem_stubs.c b/src/intel/vulkan/anv_gem_stubs.c
> index 5093bd5db1a..8cc3ad1f22e 100644
> --- a/src/intel/vulkan/anv_gem_stubs.c
> +++ b/src/intel/vulkan/anv_gem_stubs.c
> @@ -251,3 +251,10 @@ anv_gem_syncobj_wait(struct anv_device *device,
>   {
>      unreachable("Unused");
>   }
> +
> +int
> +anv_gem_reg_read(struct anv_device *device,
> +                 uint32_t offset, uint64_t *result)
> +{
> +   unreachable("Unused");
> +}
> diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
> index 599b903f25c..08376b00c8e 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -1103,6 +1103,8 @@ int anv_gem_get_aperture(int fd, uint64_t *size);
>   int anv_gem_gpu_get_reset_stats(struct anv_device *device,
>                                   uint32_t *active, uint32_t *pending);
>   int anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle);
> +int anv_gem_reg_read(struct anv_device *device,
> +                     uint32_t offset, uint64_t *result);
>   uint32_t anv_gem_fd_to_handle(struct anv_device *device, int fd);
>   int anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, uint32_t caching);
>   int anv_gem_set_domain(struct anv_device *device, uint32_t gem_handle,
>
Bas Nieuwenhuizen Oct. 16, 2018, 7:33 a.m. UTC | #2
On Tue, Oct 16, 2018 at 7:31 AM Keith Packard <keithp@keithp.com> wrote:
>
> Offers three clocks, device, clock monotonic and clock monotonic
> raw. Could use some kernel support to reduce the deviation between
> clock values.
>
> v2:
>         Ensure deviation is at least as big as the GPU time interval.
>
> v3:
>         Set device->lost when returning DEVICE_LOST.
>         Use MAX2 and DIV_ROUND_UP instead of open coding these.
>         Delete spurious TIMESTAMP in radv version.
>         Suggested-by: Jason Ekstrand <jason@jlekstrand.net>
>         Suggested-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>
> v4:
>         Add anv_gem_reg_read to anv_gem_stubs.c
>         Suggested-by: Jason Ekstrand <jason@jlekstrand.net>
>
> Signed-off-by: Keith Packard <keithp@keithp.com>
> ---
>  src/amd/vulkan/radv_device.c       | 81 +++++++++++++++++++++++++++
>  src/amd/vulkan/radv_extensions.py  |  1 +
>  src/intel/vulkan/anv_device.c      | 89 ++++++++++++++++++++++++++++++
>  src/intel/vulkan/anv_extensions.py |  1 +
>  src/intel/vulkan/anv_gem.c         | 13 +++++
>  src/intel/vulkan/anv_gem_stubs.c   |  7 +++
>  src/intel/vulkan/anv_private.h     |  2 +
>  7 files changed, 194 insertions(+)
>
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index 174922780fc..80050485e54 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -4955,3 +4955,84 @@ radv_GetDeviceGroupPeerMemoryFeatures(
>                                VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT |
>                                VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT;
>  }
> +
> +static const VkTimeDomainEXT radv_time_domains[] = {
> +       VK_TIME_DOMAIN_DEVICE_EXT,
> +       VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT,
> +       VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT,
> +};
> +
> +VkResult radv_GetPhysicalDeviceCalibrateableTimeDomainsEXT(
> +       VkPhysicalDevice                             physicalDevice,
> +       uint32_t                                     *pTimeDomainCount,
> +       VkTimeDomainEXT                              *pTimeDomains)
> +{
> +       int d;
> +       VK_OUTARRAY_MAKE(out, pTimeDomains, pTimeDomainCount);
> +
> +       for (d = 0; d < ARRAY_SIZE(radv_time_domains); d++) {
> +               vk_outarray_append(&out, i) {
> +                       *i = radv_time_domains[d];
> +               }
> +       }
> +
> +       return vk_outarray_status(&out);
> +}
> +
> +static uint64_t
> +radv_clock_gettime(clockid_t clock_id)
> +{
> +       struct timespec current;
> +       int ret;
> +
> +       ret = clock_gettime(clock_id, &current);
> +       if (ret < 0 && clock_id == CLOCK_MONOTONIC_RAW)
> +               ret = clock_gettime(CLOCK_MONOTONIC, &current);
> +       if (ret < 0)
> +               return 0;
> +
> +       return (uint64_t) current.tv_sec * 1000000000ULL + current.tv_nsec;
> +}
> +
> +VkResult radv_GetCalibratedTimestampsEXT(
> +       VkDevice                                     _device,
> +       uint32_t                                     timestampCount,
> +       const VkCalibratedTimestampInfoEXT           *pTimestampInfos,
> +       uint64_t                                     *pTimestamps,
> +       uint64_t                                     *pMaxDeviation)
> +{
> +       RADV_FROM_HANDLE(radv_device, device, _device);
> +       uint32_t clock_crystal_freq = device->physical_device->rad_info.clock_crystal_freq;
> +       int d;
> +       uint64_t begin, end;
> +
> +       begin = radv_clock_gettime(CLOCK_MONOTONIC_RAW);
> +
> +       for (d = 0; d < timestampCount; d++) {
> +               switch (pTimestampInfos[d].timeDomain) {
> +               case VK_TIME_DOMAIN_DEVICE_EXT:
> +                       pTimestamps[d] = device->ws->query_value(device->ws,
> +                                                                RADEON_TIMESTAMP);
> +                       break;
> +               case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT:
> +                       pTimestamps[d] = radv_clock_gettime(CLOCK_MONOTONIC);
> +                       break;
> +
> +               case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT:
> +                       pTimestamps[d] = begin;
> +                       break;
> +               default:
> +                       pTimestamps[d] = 0;
> +                       break;
> +               }
> +       }
> +
> +       end = radv_clock_gettime(CLOCK_MONOTONIC_RAW);
> +
> +       uint64_t clock_period = end - begin;
> +       uint64_t device_period = DIV_ROUND_UP(1000000, clock_crystal_freq);
> +
> +       *pMaxDeviation = MAX2(clock_period, device_period);

Should this not be a sum? Those deviations can happen independently
from each other, so worst case both deviations happen in the same
direction which causes the magnitude to be combined.

With that change:

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>

> +
> +       return VK_SUCCESS;
> +}
> diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py
> index 5dcedae1c63..4c81d3f0068 100644
> --- a/src/amd/vulkan/radv_extensions.py
> +++ b/src/amd/vulkan/radv_extensions.py
> @@ -92,6 +92,7 @@ EXTENSIONS = [
>      Extension('VK_KHR_display',                          23, 'VK_USE_PLATFORM_DISPLAY_KHR'),
>      Extension('VK_EXT_direct_mode_display',               1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
>      Extension('VK_EXT_acquire_xlib_display',              1, 'VK_USE_PLATFORM_XLIB_XRANDR_EXT'),
> +    Extension('VK_EXT_calibrated_timestamps',             1, True),
>      Extension('VK_EXT_conditional_rendering',             1, True),
>      Extension('VK_EXT_conservative_rasterization',        1, 'device->rad_info.chip_class >= GFX9'),
>      Extension('VK_EXT_display_surface_counter',           1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
> diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
> index a2551452eb1..249d7f36f0f 100644
> --- a/src/intel/vulkan/anv_device.c
> +++ b/src/intel/vulkan/anv_device.c
> @@ -3021,6 +3021,95 @@ void anv_DestroyFramebuffer(
>     vk_free2(&device->alloc, pAllocator, fb);
>  }
>
> +static const VkTimeDomainEXT anv_time_domains[] = {
> +   VK_TIME_DOMAIN_DEVICE_EXT,
> +   VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT,
> +   VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT,
> +};
> +
> +VkResult anv_GetPhysicalDeviceCalibrateableTimeDomainsEXT(
> +   VkPhysicalDevice                             physicalDevice,
> +   uint32_t                                     *pTimeDomainCount,
> +   VkTimeDomainEXT                              *pTimeDomains)
> +{
> +   int d;
> +   VK_OUTARRAY_MAKE(out, pTimeDomains, pTimeDomainCount);
> +
> +   for (d = 0; d < ARRAY_SIZE(anv_time_domains); d++) {
> +      vk_outarray_append(&out, i) {
> +         *i = anv_time_domains[d];
> +      }
> +   }
> +
> +   return vk_outarray_status(&out);
> +}
> +
> +static uint64_t
> +anv_clock_gettime(clockid_t clock_id)
> +{
> +   struct timespec current;
> +   int ret;
> +
> +   ret = clock_gettime(clock_id, &current);
> +   if (ret < 0 && clock_id == CLOCK_MONOTONIC_RAW)
> +      ret = clock_gettime(CLOCK_MONOTONIC, &current);
> +   if (ret < 0)
> +      return 0;
> +
> +   return (uint64_t) current.tv_sec * 1000000000ULL + current.tv_nsec;
> +}
> +
> +#define TIMESTAMP 0x2358
> +
> +VkResult anv_GetCalibratedTimestampsEXT(
> +   VkDevice                                     _device,
> +   uint32_t                                     timestampCount,
> +   const VkCalibratedTimestampInfoEXT           *pTimestampInfos,
> +   uint64_t                                     *pTimestamps,
> +   uint64_t                                     *pMaxDeviation)
> +{
> +   ANV_FROM_HANDLE(anv_device, device, _device);
> +   uint64_t timestamp_frequency = device->info.timestamp_frequency;
> +   int  ret;
> +   int d;
> +   uint64_t begin, end;
> +
> +   begin = anv_clock_gettime(CLOCK_MONOTONIC_RAW);
> +
> +   for (d = 0; d < timestampCount; d++) {
> +      switch (pTimestampInfos[d].timeDomain) {
> +      case VK_TIME_DOMAIN_DEVICE_EXT:
> +         ret = anv_gem_reg_read(device, TIMESTAMP | 1,
> +                                &pTimestamps[d]);
> +
> +         if (ret != 0) {
> +            device->lost = TRUE;
> +            return VK_ERROR_DEVICE_LOST;
> +         }
> +         break;
> +      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT:
> +         pTimestamps[d] = anv_clock_gettime(CLOCK_MONOTONIC);
> +         break;
> +
> +      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT:
> +         pTimestamps[d] = begin;
> +         break;
> +      default:
> +         pTimestamps[d] = 0;
> +         break;
> +      }
> +   }
> +
> +   end = anv_clock_gettime(CLOCK_MONOTONIC_RAW);
> +
> +   uint64_t clock_period = end - begin;
> +   uint64_t device_period = DIV_ROUND_UP(1000000000, timestamp_frequency);
> +
> +   *pMaxDeviation = MAX2(clock_period, device_period);
> +
> +   return VK_SUCCESS;
> +}
> +
>  /* vk_icd.h does not declare this function, so we declare it here to
>   * suppress Wmissing-prototypes.
>   */
> diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py
> index d4915c95013..a8535964da7 100644
> --- a/src/intel/vulkan/anv_extensions.py
> +++ b/src/intel/vulkan/anv_extensions.py
> @@ -126,6 +126,7 @@ EXTENSIONS = [
>      Extension('VK_EXT_vertex_attribute_divisor',          3, True),
>      Extension('VK_EXT_post_depth_coverage',               1, 'device->info.gen >= 9'),
>      Extension('VK_EXT_sampler_filter_minmax',             1, 'device->info.gen >= 9'),
> +    Extension('VK_EXT_calibrated_timestamps',             1, True),
>  ]
>
>  class VkVersion:
> diff --git a/src/intel/vulkan/anv_gem.c b/src/intel/vulkan/anv_gem.c
> index c43b5ef9e06..1bdf040c1a3 100644
> --- a/src/intel/vulkan/anv_gem.c
> +++ b/src/intel/vulkan/anv_gem.c
> @@ -423,6 +423,19 @@ anv_gem_fd_to_handle(struct anv_device *device, int fd)
>     return args.handle;
>  }
>
> +int
> +anv_gem_reg_read(struct anv_device *device, uint32_t offset, uint64_t *result)
> +{
> +   struct drm_i915_reg_read args = {
> +      .offset = offset
> +   };
> +
> +   int ret = anv_ioctl(device->fd, DRM_IOCTL_I915_REG_READ, &args);
> +
> +   *result = args.val;
> +   return ret;
> +}
> +
>  #ifndef SYNC_IOC_MAGIC
>  /* duplicated from linux/sync_file.h to avoid build-time dependency
>   * on new (v4.7) kernel headers.  Once distro's are mostly using
> diff --git a/src/intel/vulkan/anv_gem_stubs.c b/src/intel/vulkan/anv_gem_stubs.c
> index 5093bd5db1a..8cc3ad1f22e 100644
> --- a/src/intel/vulkan/anv_gem_stubs.c
> +++ b/src/intel/vulkan/anv_gem_stubs.c
> @@ -251,3 +251,10 @@ anv_gem_syncobj_wait(struct anv_device *device,
>  {
>     unreachable("Unused");
>  }
> +
> +int
> +anv_gem_reg_read(struct anv_device *device,
> +                 uint32_t offset, uint64_t *result)
> +{
> +   unreachable("Unused");
> +}
> diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
> index 599b903f25c..08376b00c8e 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -1103,6 +1103,8 @@ int anv_gem_get_aperture(int fd, uint64_t *size);
>  int anv_gem_gpu_get_reset_stats(struct anv_device *device,
>                                  uint32_t *active, uint32_t *pending);
>  int anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle);
> +int anv_gem_reg_read(struct anv_device *device,
> +                     uint32_t offset, uint64_t *result);
>  uint32_t anv_gem_fd_to_handle(struct anv_device *device, int fd);
>  int anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, uint32_t caching);
>  int anv_gem_set_domain(struct anv_device *device, uint32_t gem_handle,
> --
> 2.19.1
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
Keith Packard Oct. 16, 2018, 7:34 p.m. UTC | #3
Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> writes:

>> +       end = radv_clock_gettime(CLOCK_MONOTONIC_RAW);
>> +
>> +       uint64_t clock_period = end - begin;
>> +       uint64_t device_period = DIV_ROUND_UP(1000000, clock_crystal_freq);
>> +
>> +       *pMaxDeviation = MAX2(clock_period, device_period);
>
> Should this not be a sum? Those deviations can happen independently
> from each other, so worst case both deviations happen in the same
> direction which causes the magnitude to be combined.

This use of MAX2 comes right from one of the issues raised during work
on the extension:

 8) Can the maximum deviation reported ever be zero?

 RESOLVED: Unless the tick of each clock corresponding to the set of
 time domains coincides and all clocks can literally be sampled
 simutaneously, there isn’t really a possibility for the maximum
 deviation to be zero, so by convention the maximum deviation is always
 at least the maximum of the length of the ticks of the set of time
 domains calibrated and thus can never be zero.

I can't wrap my brain around this entirely, but I think that this says
that the deviation reported is supposed to only reflect the fact that we
aren't sampling the clocks at the same time, and so there may be a
'tick' of error for any sampled clock.

If you look at the previous issue in the spec, that actually has the
pseudo code I used in this implementation for computing maxDeviation
which doesn't include anything about the time period of the GPU.

Jason suggested using the GPU period as the minimum value for
maxDeviation at the GPU time period to make sure we never accidentally
returned zero, as that is forbidden by the spec. We might be able to use
1 instead, but it won't matter in practice as the time it takes to
actually sample all of the clocks is far longer than a GPU tick.
Jason Ekstrand Oct. 16, 2018, 7:44 p.m. UTC | #4
On Tue, Oct 16, 2018 at 2:35 PM Keith Packard <keithp@keithp.com> wrote:

> Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> writes:
>
> >> +       end = radv_clock_gettime(CLOCK_MONOTONIC_RAW);
> >> +
> >> +       uint64_t clock_period = end - begin;
> >> +       uint64_t device_period = DIV_ROUND_UP(1000000,
> clock_crystal_freq);
> >> +
> >> +       *pMaxDeviation = MAX2(clock_period, device_period);
> >
> > Should this not be a sum? Those deviations can happen independently
> > from each other, so worst case both deviations happen in the same
> > direction which causes the magnitude to be combined.
>
> This use of MAX2 comes right from one of the issues raised during work
> on the extension:
>
>  8) Can the maximum deviation reported ever be zero?
>
>  RESOLVED: Unless the tick of each clock corresponding to the set of
>  time domains coincides and all clocks can literally be sampled
>  simutaneously, there isn’t really a possibility for the maximum
>  deviation to be zero, so by convention the maximum deviation is always
>  at least the maximum of the length of the ticks of the set of time
>  domains calibrated and thus can never be zero.
>
> I can't wrap my brain around this entirely, but I think that this says
> that the deviation reported is supposed to only reflect the fact that we
> aren't sampling the clocks at the same time, and so there may be a
> 'tick' of error for any sampled clock.
>
> If you look at the previous issue in the spec, that actually has the
> pseudo code I used in this implementation for computing maxDeviation
> which doesn't include anything about the time period of the GPU.
>
> Jason suggested using the GPU period as the minimum value for
> maxDeviation at the GPU time period to make sure we never accidentally
> returned zero, as that is forbidden by the spec. We might be able to use
> 1 instead, but it won't matter in practice as the time it takes to
> actually sample all of the clocks is far longer than a GPU tick.
>

I think what Bas is getting at is that there are two problems:

 1) We are not sampling at exactly the same time
 2) The two clocks may not tick at exactly the same time.

Even if I can simultaneously sample the CPU and GPU clocks, their
oscilators are not aligned and I my sample may be at the begining of the
CPU tick and the end of the GPU tick.  If I had sampled 75ns earlier, I
could have gotten lower CPU time but the same GPU time (most intel GPUs
have about an 80ns tick).

If we want to be conservative, I suspect Bas may be right that adding is
the safer thing to do.

--Jason
<div dir="ltr"><div class="gmail_quote"><div dir="ltr">On Tue, Oct 16, 2018 at 2:35 PM Keith Packard &lt;<a href="mailto:keithp@keithp.com">keithp@keithp.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Bas Nieuwenhuizen &lt;<a href="mailto:bas@basnieuwenhuizen.nl" target="_blank">bas@basnieuwenhuizen.nl</a>&gt; writes:<br>
<br>
&gt;&gt; +       end = radv_clock_gettime(CLOCK_MONOTONIC_RAW);<br>
&gt;&gt; +<br>
&gt;&gt; +       uint64_t clock_period = end - begin;<br>
&gt;&gt; +       uint64_t device_period = DIV_ROUND_UP(1000000, clock_crystal_freq);<br>
&gt;&gt; +<br>
&gt;&gt; +       *pMaxDeviation = MAX2(clock_period, device_period);<br>
&gt;<br>
&gt; Should this not be a sum? Those deviations can happen independently<br>
&gt; from each other, so worst case both deviations happen in the same<br>
&gt; direction which causes the magnitude to be combined.<br>
<br>
This use of MAX2 comes right from one of the issues raised during work<br>
on the extension:<br>
<br>
 8) Can the maximum deviation reported ever be zero?<br>
<br>
 RESOLVED: Unless the tick of each clock corresponding to the set of<br>
 time domains coincides and all clocks can literally be sampled<br>
 simutaneously, there isn’t really a possibility for the maximum<br>
 deviation to be zero, so by convention the maximum deviation is always<br>
 at least the maximum of the length of the ticks of the set of time<br>
 domains calibrated and thus can never be zero.<br>
<br>
I can&#39;t wrap my brain around this entirely, but I think that this says<br>
that the deviation reported is supposed to only reflect the fact that we<br>
aren&#39;t sampling the clocks at the same time, and so there may be a<br>
&#39;tick&#39; of error for any sampled clock.<br>
<br>
If you look at the previous issue in the spec, that actually has the<br>
pseudo code I used in this implementation for computing maxDeviation<br>
which doesn&#39;t include anything about the time period of the GPU.<br>
<br>
Jason suggested using the GPU period as the minimum value for<br>
maxDeviation at the GPU time period to make sure we never accidentally<br>
returned zero, as that is forbidden by the spec. We might be able to use<br>
1 instead, but it won&#39;t matter in practice as the time it takes to<br>
actually sample all of the clocks is far longer than a GPU tick.<br></blockquote><div><br></div><div>I think what Bas is getting at is that there are two problems:</div><div><br></div><div> 1) We are not sampling at exactly the same time</div><div> 2) The two clocks may not tick at exactly the same time.</div><div><br></div><div>Even if I can simultaneously sample the CPU and GPU clocks, their oscilators are not aligned and I my sample may be at the begining of the CPU tick and the end of the GPU tick.  If I had sampled 75ns earlier, I could have gotten lower CPU time but the same GPU time (most intel GPUs have about an 80ns tick).</div><div><br></div><div>If we want to be conservative, I suspect Bas may be right that adding is the safer thing to do.</div><div><br></div><div>--Jason<br></div></div></div>
Keith Packard Oct. 16, 2018, 9:07 p.m. UTC | #5
Jason Ekstrand <jason@jlekstrand.net> writes:

> I think what Bas is getting at is that there are two problems:
>
>  1) We are not sampling at exactly the same time
>  2) The two clocks may not tick at exactly the same time.

Thanks for the clarification.

> If we want to be conservative, I suspect Bas may be right that adding is
> the safer thing to do.

Yes, it's certainly safe to increase the value of
maxDeviation. Currently, the time it takes to sample all of the clocks
is far larger than the GPU tick, so adding that in would not have a huge
impact on the value returned to the application.

I'd like to dig in a little further and actually understand if the
current computation (which is derived directly from the Vulkan spec) is
wrong, and if so, whether the spec needs to be adjusted.

I think the question is what 'maxDeviation' is supposed to
represent. All the spec says is:

 * pMaxDeviation is a pointer to a 64-bit unsigned integer value in
   which the strictly positive maximum deviation, in nanoseconds, of the
   calibrated timestamp values is returned.

I interpret this as the maximum error in sampling the individual clocks,
which is to say that the clock values are guaranteed to have been
sampled within this interval of each other.

So, if we have a monotonic clock and GPU clock:

          0 1 2 3 4 5 6 7 8 9 a b c d e f
Monotonic -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-

          0         1         2         3
GPU       -----_____-----_____-----_____-----_____


gpu_period in this case is 5 ticks of the monotonic clock.

Now, I perform three operations:

        start = read(monotonic)
        gpu   = read(GPU)
        end   = read(monotonic)

Let's say that:

        start = 2
        GPU = 1 * 5 = 5 monotonic equivalent ticks
        end = b

So, the question is only how large the error between GPU and start could
possibly be. Certainly the GPU clock was sampled some time between
when monotonic tick 2 started and monotonic tick b ended. But, we have
no idea what phase the GPU clock was in when sampled.

Let's imagine we manage to sample the GPU clock immediately after the
first monotonic sample. I'll shift the offset of the monotonic and GPU
clocks to retain the same values (start = 2, GPU = 1), but now
the GPU clock is being sampled immediately after monotonic time 2:

                w x y z 0 1 2 3 4 5 6 7 8 9 a b c d e f
Monotonic       -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-

          0         1         2         3
GPU       -----_____-----_____-----_____-----_____


In this case, the GPU tick started at monotonic time y, nearly 5
monotonic ticks earlier than the measured monotonic time, so the
deviation between GPU and monotonic would be 5 ticks.

If we sample the GPU clock immediately before the second monotonic
sample, then that GPU tick either starts earlier than the range, in
which case the above evaluation holds, or the GPU tick is entirely
contained within the range:

          0 1 2 3 4 5 6 7 8 9 a b c d e f
Monotonic -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-

           z         0         1         2         3
GPU      __-----_____-----_____-----_____-----_____-----

In this case, the deviation between the first monotonic sample (that
returned to the application as the monotonic time) and the GPU sample is
the whole interval of measurement (b - 2).

I think I've just managed to convince myself that Jason's first
suggestion (max2(sample interval, gpu interval)) is correct, although I
think we should add '1' to the interval to account for sampling phase
errors in the monotonic clock. As that's measured in ns, and I'm
currently getting values in the µs range, that's a small error in
comparison.
Jason Ekstrand Oct. 16, 2018, 9:33 p.m. UTC | #6
On Tue, Oct 16, 2018 at 4:07 PM Keith Packard <keithp@keithp.com> wrote:

> Jason Ekstrand <jason@jlekstrand.net> writes:
>
> > I think what Bas is getting at is that there are two problems:
> >
> >  1) We are not sampling at exactly the same time
> >  2) The two clocks may not tick at exactly the same time.
>
> Thanks for the clarification.
>
> > If we want to be conservative, I suspect Bas may be right that adding is
> > the safer thing to do.
>
> Yes, it's certainly safe to increase the value of
> maxDeviation. Currently, the time it takes to sample all of the clocks
> is far larger than the GPU tick, so adding that in would not have a huge
> impact on the value returned to the application.
>
> I'd like to dig in a little further and actually understand if the
> current computation (which is derived directly from the Vulkan spec) is
> wrong, and if so, whether the spec needs to be adjusted.
>
> I think the question is what 'maxDeviation' is supposed to
> represent. All the spec says is:
>
>  * pMaxDeviation is a pointer to a 64-bit unsigned integer value in
>    which the strictly positive maximum deviation, in nanoseconds, of the
>    calibrated timestamp values is returned.
>
> I interpret this as the maximum error in sampling the individual clocks,
> which is to say that the clock values are guaranteed to have been
> sampled within this interval of each other.
>
> So, if we have a monotonic clock and GPU clock:
>
>           0 1 2 3 4 5 6 7 8 9 a b c d e f
> Monotonic -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
>
>           0         1         2         3
> GPU       -----_____-----_____-----_____-----_____
>
>
> gpu_period in this case is 5 ticks of the monotonic clock.
>
> Now, I perform three operations:
>
>         start = read(monotonic)
>         gpu   = read(GPU)
>         end   = read(monotonic)
>
> Let's say that:
>
>         start = 2
>         GPU = 1 * 5 = 5 monotonic equivalent ticks
>         end = b
>
> So, the question is only how large the error between GPU and start could
> possibly be. Certainly the GPU clock was sampled some time between
> when monotonic tick 2 started and monotonic tick b ended. But, we have
> no idea what phase the GPU clock was in when sampled.
>
> Let's imagine we manage to sample the GPU clock immediately after the
> first monotonic sample. I'll shift the offset of the monotonic and GPU
> clocks to retain the same values (start = 2, GPU = 1), but now
> the GPU clock is being sampled immediately after monotonic time 2:
>
>                 w x y z 0 1 2 3 4 5 6 7 8 9 a b c d e f
> Monotonic       -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
>
>           0         1         2         3
> GPU       -----_____-----_____-----_____-----_____
>
>
> In this case, the GPU tick started at monotonic time y, nearly 5
> monotonic ticks earlier than the measured monotonic time, so the
> deviation between GPU and monotonic would be 5 ticks.
>
> If we sample the GPU clock immediately before the second monotonic
> sample, then that GPU tick either starts earlier than the range, in
> which case the above evaluation holds, or the GPU tick is entirely
> contained within the range:
>
>           0 1 2 3 4 5 6 7 8 9 a b c d e f
> Monotonic -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
>
>            z         0         1         2         3
> GPU      __-----_____-----_____-----_____-----_____-----
>
> In this case, the deviation between the first monotonic sample (that
> returned to the application as the monotonic time) and the GPU sample is
> the whole interval of measurement (b - 2).
>
> I think I've just managed to convince myself that Jason's first
> suggestion (max2(sample interval, gpu interval)) is correct, although I
> think we should add '1' to the interval to account for sampling phase
> errors in the monotonic clock. As that's measured in ns, and I'm
> currently getting values in the µs range, that's a small error in
> comparison.
>

You've got me almost convinced as well.  Thanks for the diagrams!  I think
instead of adding 1 perhaps what we want is

max2(sample_interval_ns, gpu_tick_ns + monotonic_tick_ns)

Where monotonic_tick_ns is maybe as low as 1.  Am I following you correctly?

--Jason
<div dir="ltr"><div class="gmail_quote"><div dir="ltr">On Tue, Oct 16, 2018 at 4:07 PM Keith Packard &lt;<a href="mailto:keithp@keithp.com">keithp@keithp.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Jason Ekstrand &lt;<a href="mailto:jason@jlekstrand.net" target="_blank">jason@jlekstrand.net</a>&gt; writes:<br>
<br>
&gt; I think what Bas is getting at is that there are two problems:<br>
&gt;<br>
&gt;  1) We are not sampling at exactly the same time<br>
&gt;  2) The two clocks may not tick at exactly the same time.<br>
<br>
Thanks for the clarification.<br>
<br>
&gt; If we want to be conservative, I suspect Bas may be right that adding is<br>
&gt; the safer thing to do.<br>
<br>
Yes, it&#39;s certainly safe to increase the value of<br>
maxDeviation. Currently, the time it takes to sample all of the clocks<br>
is far larger than the GPU tick, so adding that in would not have a huge<br>
impact on the value returned to the application.<br>
<br>
I&#39;d like to dig in a little further and actually understand if the<br>
current computation (which is derived directly from the Vulkan spec) is<br>
wrong, and if so, whether the spec needs to be adjusted.<br>
<br>
I think the question is what &#39;maxDeviation&#39; is supposed to<br>
represent. All the spec says is:<br>
<br>
 * pMaxDeviation is a pointer to a 64-bit unsigned integer value in<br>
   which the strictly positive maximum deviation, in nanoseconds, of the<br>
   calibrated timestamp values is returned.<br>
<br>
I interpret this as the maximum error in sampling the individual clocks,<br>
which is to say that the clock values are guaranteed to have been<br>
sampled within this interval of each other.<br>
<br>
So, if we have a monotonic clock and GPU clock:<br>
<br>
          0 1 2 3 4 5 6 7 8 9 a b c d e f<br>
Monotonic -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-<br>
<br>
          0         1         2         3<br>
GPU       -----_____-----_____-----_____-----_____<br>
<br>
<br>
gpu_period in this case is 5 ticks of the monotonic clock.<br>
<br>
Now, I perform three operations:<br>
<br>
        start = read(monotonic)<br>
        gpu   = read(GPU)<br>
        end   = read(monotonic)<br>
<br>
Let&#39;s say that:<br>
<br>
        start = 2<br>
        GPU = 1 * 5 = 5 monotonic equivalent ticks<br>
        end = b<br>
<br>
So, the question is only how large the error between GPU and start could<br>
possibly be. Certainly the GPU clock was sampled some time between<br>
when monotonic tick 2 started and monotonic tick b ended. But, we have<br>
no idea what phase the GPU clock was in when sampled.<br>
<br>
Let&#39;s imagine we manage to sample the GPU clock immediately after the<br>
first monotonic sample. I&#39;ll shift the offset of the monotonic and GPU<br>
clocks to retain the same values (start = 2, GPU = 1), but now<br>
the GPU clock is being sampled immediately after monotonic time 2:<br>
<br>
                w x y z 0 1 2 3 4 5 6 7 8 9 a b c d e f<br>
Monotonic       -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-<br>
<br>
          0         1         2         3<br>
GPU       -----_____-----_____-----_____-----_____<br>
<br>
<br>
In this case, the GPU tick started at monotonic time y, nearly 5<br>
monotonic ticks earlier than the measured monotonic time, so the<br>
deviation between GPU and monotonic would be 5 ticks.<br>
<br>
If we sample the GPU clock immediately before the second monotonic<br>
sample, then that GPU tick either starts earlier than the range, in<br>
which case the above evaluation holds, or the GPU tick is entirely<br>
contained within the range:<br>
<br>
          0 1 2 3 4 5 6 7 8 9 a b c d e f<br>
Monotonic -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-<br>
<br>
           z         0         1         2         3<br>
GPU      __-----_____-----_____-----_____-----_____-----<br>
<br>
In this case, the deviation between the first monotonic sample (that<br>
returned to the application as the monotonic time) and the GPU sample is<br>
the whole interval of measurement (b - 2).<br>
<br>
I think I&#39;ve just managed to convince myself that Jason&#39;s first<br>
suggestion (max2(sample interval, gpu interval)) is correct, although I<br>
think we should add &#39;1&#39; to the interval to account for sampling phase<br>
errors in the monotonic clock. As that&#39;s measured in ns, and I&#39;m<br>
currently getting values in the µs range, that&#39;s a small error in<br>
comparison.<br></blockquote><div><br></div><div>You&#39;ve got me almost convinced as well.  Thanks for the diagrams!  I think instead of adding 1 perhaps what we want is</div><div><br></div><div>max2(sample_interval_ns, gpu_tick_ns + monotonic_tick_ns)</div><div><br></div><div>Where monotonic_tick_ns is maybe as low as 1.  Am I following you correctly?<br></div><div><br></div><div>--Jason<br></div></div></div>
Bas Nieuwenhuizen Oct. 16, 2018, 9:38 p.m. UTC | #7
On Tue, Oct 16, 2018 at 11:07 PM Keith Packard <keithp@keithp.com> wrote:
>
> Jason Ekstrand <jason@jlekstrand.net> writes:
>
> > I think what Bas is getting at is that there are two problems:
> >
> >  1) We are not sampling at exactly the same time
> >  2) The two clocks may not tick at exactly the same time.
>
> Thanks for the clarification.
>
> > If we want to be conservative, I suspect Bas may be right that adding is
> > the safer thing to do.
>
> Yes, it's certainly safe to increase the value of
> maxDeviation. Currently, the time it takes to sample all of the clocks
> is far larger than the GPU tick, so adding that in would not have a huge
> impact on the value returned to the application.
>
> I'd like to dig in a little further and actually understand if the
> current computation (which is derived directly from the Vulkan spec) is
> wrong, and if so, whether the spec needs to be adjusted.
>
> I think the question is what 'maxDeviation' is supposed to
> represent. All the spec says is:
>
>  * pMaxDeviation is a pointer to a 64-bit unsigned integer value in
>    which the strictly positive maximum deviation, in nanoseconds, of the
>    calibrated timestamp values is returned.
>
> I interpret this as the maximum error in sampling the individual clocks,
> which is to say that the clock values are guaranteed to have been
> sampled within this interval of each other.

With this interpretation I don't think you need to account for period at all?

>
> So, if we have a monotonic clock and GPU clock:
>
>           0 1 2 3 4 5 6 7 8 9 a b c d e f
> Monotonic -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
>
>           0         1         2         3
> GPU       -----_____-----_____-----_____-----_____
>
>
> gpu_period in this case is 5 ticks of the monotonic clock.
>
> Now, I perform three operations:
>
>         start = read(monotonic)
>         gpu   = read(GPU)
>         end   = read(monotonic)
>
> Let's say that:
>
>         start = 2
>         GPU = 1 * 5 = 5 monotonic equivalent ticks
>         end = b
>
> So, the question is only how large the error between GPU and start could
> possibly be. Certainly the GPU clock was sampled some time between
> when monotonic tick 2 started and monotonic tick b ended. But, we have
> no idea what phase the GPU clock was in when sampled.
>
> Let's imagine we manage to sample the GPU clock immediately after the
> first monotonic sample. I'll shift the offset of the monotonic and GPU
> clocks to retain the same values (start = 2, GPU = 1), but now
> the GPU clock is being sampled immediately after monotonic time 2:
>
>                 w x y z 0 1 2 3 4 5 6 7 8 9 a b c d e f
> Monotonic       -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
>
>           0         1         2         3
> GPU       -----_____-----_____-----_____-----_____
>
>
> In this case, the GPU tick started at monotonic time y, nearly 5
> monotonic ticks earlier than the measured monotonic time, so the
> deviation between GPU and monotonic would be 5 ticks.

Well the complication here is that in the MONOTONIC (not
MONOTONIC_RAW) case the CPU measurement can happen at the end of the
MONOTONIC_RAW interval (as the order of measurements is based on
argument order), so you can get a tick that started `period` (5 in
this case) monotonic ticks before the start of the interval and a CPU
measurement at the end of the interval.

>
> If we sample the GPU clock immediately before the second monotonic
> sample, then that GPU tick either starts earlier than the range, in
> which case the above evaluation holds, or the GPU tick is entirely
> contained within the range:
>
>           0 1 2 3 4 5 6 7 8 9 a b c d e f
> Monotonic -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
>
>            z         0         1         2         3
> GPU      __-----_____-----_____-----_____-----_____-----
>
> In this case, the deviation between the first monotonic sample (that
> returned to the application as the monotonic time) and the GPU sample is
> the whole interval of measurement (b - 2).
>
> I think I've just managed to convince myself that Jason's first
> suggestion (max2(sample interval, gpu interval)) is correct, although I
> think we should add '1' to the interval to account for sampling phase
> errors in the monotonic clock. As that's measured in ns, and I'm
> currently getting values in the µs range, that's a small error in
> comparison.
>
> --
> -keith
Keith Packard Oct. 16, 2018, 9:51 p.m. UTC | #8
Jason Ekstrand <jason@jlekstrand.net> writes:


> You've got me almost convinced as well.  Thanks for the diagrams!  I think
> instead of adding 1 perhaps what we want is
>
> max2(sample_interval_ns, gpu_tick_ns + monotonic_tick_ns)
>
> Where monotonic_tick_ns is maybe as low as 1.  Am I following you correctly?

Not quite; I was thinking that because the sample_interval_ns is
measured by sampling the monotonic clock twice, the actual interval can
be up to 1 tick longer, so

max2(sample_interval_ns + monotonic_tick_ns, gpu_tick_ns)

The gpu_tick_ns is computed, not measured, and so accurately reflects
the maximum deviation in the measurements.
Keith Packard Oct. 16, 2018, 10:06 p.m. UTC | #9
Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> writes:

> Well the complication here is that in the MONOTONIC (not
> MONOTONIC_RAW) case the CPU measurement can happen at the end of the
> MONOTONIC_RAW interval (as the order of measurements is based on
> argument order), so you can get a tick that started `period` (5 in
> this case) monotonic ticks before the start of the interval and a CPU
> measurement at the end of the interval.

Ah, that's an excellent point. Let's split out raw and monotonic and
take a look. You want the GPU sampled at the start of the raw interval
and monotonic sampled at the end, I think?

                 w x y z 0 1 2 3 4 5 6 7 8 9 a b c d e f
Raw              -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-

          0         1         2         3
GPU       -----_____-----_____-----_____-----_____

                                    x y z 0 1 2 3 4 5 6 7 8 9 a b c
Monotonic                           -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-

Interval                     <----------------->
Deviation           <-------------------------->

        start = read(raw)       2
        gpu   = read(GPU)       1
        mono  = read(monotonic) 2
        end   = read(raw)       b

In this case, the error between the monotonic pulse and the GPU is
interval + gpu_period (probably plus one to include the measurement
error of the raw clock).

Thanks for finding this case.

Now, I guess the question is whether we want to try and find the
smallest maxDeviation possible for each query. For instance, if the
application asks only for raw and gpu, the max_deviation could be
max2(interval+1,gpu_period), but if it asks for monotonic and gpu, it
would be interval+1+gpu_period. I'm not seeing a simple definition
here...
Bas Nieuwenhuizen Oct. 16, 2018, 10:28 p.m. UTC | #10
On Wed, Oct 17, 2018 at 12:06 AM Keith Packard <keithp@keithp.com> wrote:
>
> Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> writes:
>
> > Well the complication here is that in the MONOTONIC (not
> > MONOTONIC_RAW) case the CPU measurement can happen at the end of the
> > MONOTONIC_RAW interval (as the order of measurements is based on
> > argument order), so you can get a tick that started `period` (5 in
> > this case) monotonic ticks before the start of the interval and a CPU
> > measurement at the end of the interval.
>
> Ah, that's an excellent point. Let's split out raw and monotonic and
> take a look. You want the GPU sampled at the start of the raw interval
> and monotonic sampled at the end, I think?
>
>                  w x y z 0 1 2 3 4 5 6 7 8 9 a b c d e f
> Raw              -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
>
>           0         1         2         3
> GPU       -----_____-----_____-----_____-----_____
>
>                                     x y z 0 1 2 3 4 5 6 7 8 9 a b c
> Monotonic                           -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
>
> Interval                     <----------------->
> Deviation           <-------------------------->
>
>         start = read(raw)       2
>         gpu   = read(GPU)       1
>         mono  = read(monotonic) 2
>         end   = read(raw)       b
>
> In this case, the error between the monotonic pulse and the GPU is
> interval + gpu_period (probably plus one to include the measurement
> error of the raw clock).
>
> Thanks for finding this case.
>
> Now, I guess the question is whether we want to try and find the
> smallest maxDeviation possible for each query. For instance, if the
> application asks only for raw and gpu, the max_deviation could be
> max2(interval+1,gpu_period), but if it asks for monotonic and gpu, it
> would be interval+1+gpu_period. I'm not seeing a simple definition
> here...

You can make the monotonic case the same as the raw case if you make
sure to always sample the CPU first by e.g. splitting the loops into
two and doing CPU in the first and GPU in the second. That way you
make the case above impossible.

That said "start of the interval of the tick" is kinda arbitrary and
you could pick random other points on that interval, so depending on
what requirements you put on it (i.e. can the chosen position be
different per call, consistent but implicit or explicitly picked which
might be leaked through the interface) the max deviation changes. So
depending on interpretation this thing can be very moot ...


>
> --
> -keith
Keith Packard Oct. 16, 2018, 10:56 p.m. UTC | #11
Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> writes:

> You can make the monotonic case the same as the raw case if you make
> sure to always sample the CPU first by e.g. splitting the loops into
> two and doing CPU in the first and GPU in the second. That way you
> make the case above impossible.

Right, I had thought of that, and it probably solves the problem for
us. If more time domains are added, things become 'more complicated'
though.

> That said "start of the interval of the tick" is kinda arbitrary and
> you could pick random other points on that interval, so depending on
> what requirements you put on it (i.e. can the chosen position be
> different per call, consistent but implicit or explicitly picked which
> might be leaked through the interface) the max deviation changes. So
> depending on interpretation this thing can be very moot ...

It doesn't really matter what phase you use; the timer increments
periodically, and what really matters is the time when that happens
relative to other clocks changing.
Jason Ekstrand Oct. 17, 2018, 1:55 a.m. UTC | #12
On Tue, Oct 16, 2018 at 5:06 PM Keith Packard <keithp@keithp.com> wrote:

> Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> writes:
>
> > Well the complication here is that in the MONOTONIC (not
> > MONOTONIC_RAW) case the CPU measurement can happen at the end of the
> > MONOTONIC_RAW interval (as the order of measurements is based on
> > argument order), so you can get a tick that started `period` (5 in
> > this case) monotonic ticks before the start of the interval and a CPU
> > measurement at the end of the interval.
>
> Ah, that's an excellent point. Let's split out raw and monotonic and
> take a look. You want the GPU sampled at the start of the raw interval
> and monotonic sampled at the end, I think?
>
>                  w x y z 0 1 2 3 4 5 6 7 8 9 a b c d e f
> Raw              -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
>
>           0         1         2         3
> GPU       -----_____-----_____-----_____-----_____
>
>                                     x y z 0 1 2 3 4 5 6 7 8 9 a b c
> Monotonic                           -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
>
> Interval                     <----------------->
> Deviation           <-------------------------->
>
>         start = read(raw)       2
>         gpu   = read(GPU)       1
>         mono  = read(monotonic) 2
>         end   = read(raw)       b
>
> In this case, the error between the monotonic pulse and the GPU is
> interval + gpu_period (probably plus one to include the measurement
> error of the raw clock).
>

I'm very confused by this case.  Why is monotonic timeline delayed?  It
seems to me like it's only the monotonic sampling that's delayed and the
result is that mono ends up closer to end than start so the sampled value
would be something like 9 or a rather than 2?

I think we can model this fairly simply as two components:

 1) The size of the sampling window; this is "end - start +
monotonic_raw_tick"
 2) The maximum phase shift of any sample.  The only issue here is that a
tick may have started before the sampling window so we need to add on the
maximum tick size.  The worst case bound for this is when the early sampled
clock is sampled at the end of a tick and the late sampled clock is sampled
at the beginning of a tick.

The result is that we're looking at something like "end - start +
monotonic_raw_tick + max(gpu_tick, monotonic_tick)"  Have I just come
full-circle?
<div dir="ltr"><div class="gmail_quote"><div dir="ltr">On Tue, Oct 16, 2018 at 5:06 PM Keith Packard &lt;<a href="mailto:keithp@keithp.com">keithp@keithp.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Bas Nieuwenhuizen &lt;<a href="mailto:bas@basnieuwenhuizen.nl" target="_blank">bas@basnieuwenhuizen.nl</a>&gt; writes:<br>
<br>
&gt; Well the complication here is that in the MONOTONIC (not<br>
&gt; MONOTONIC_RAW) case the CPU measurement can happen at the end of the<br>
&gt; MONOTONIC_RAW interval (as the order of measurements is based on<br>
&gt; argument order), so you can get a tick that started `period` (5 in<br>
&gt; this case) monotonic ticks before the start of the interval and a CPU<br>
&gt; measurement at the end of the interval.<br>
<br>
Ah, that&#39;s an excellent point. Let&#39;s split out raw and monotonic and<br>
take a look. You want the GPU sampled at the start of the raw interval<br>
and monotonic sampled at the end, I think?<br>
<br>
                 w x y z 0 1 2 3 4 5 6 7 8 9 a b c d e f<br>
Raw              -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-<br>
<br>
          0         1         2         3<br>
GPU       -----_____-----_____-----_____-----_____<br>
<br>
                                    x y z 0 1 2 3 4 5 6 7 8 9 a b c<br>
Monotonic                           -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-<br>
<br>
Interval                     &lt;-----------------&gt;<br>
Deviation           &lt;--------------------------&gt;<br>
<br>
        start = read(raw)       2<br>
        gpu   = read(GPU)       1<br>
        mono  = read(monotonic) 2<br>
        end   = read(raw)       b<br>
<br>
In this case, the error between the monotonic pulse and the GPU is<br>
interval + gpu_period (probably plus one to include the measurement<br>
error of the raw clock).<br></blockquote><div><br></div><div>I&#39;m very confused by this case.  Why is monotonic timeline delayed?  It seems to me like it&#39;s only the monotonic sampling that&#39;s delayed and the result is that mono ends up closer to end than start so the sampled value would be something like 9 or a rather than 2?</div><div><br></div><div>I think we can model this fairly simply as two components:</div><div><br></div><div> 1) The size of the sampling window; this is &quot;end - start + monotonic_raw_tick&quot;</div><div> 2) The maximum phase shift of any sample.  The only issue here is that a tick may have started before the sampling window so we need to add on the maximum tick size.  The worst case bound for this is when the early sampled clock is sampled at the end of a tick and the late sampled clock is sampled at the beginning of a tick.<br></div><div><br></div><div>The result is that we&#39;re looking at something like &quot;end - start + monotonic_raw_tick + max(gpu_tick, monotonic_tick)&quot;  Have I just come full-circle?<br></div></div></div>
Jason Ekstrand Oct. 17, 2018, 2:18 a.m. UTC | #13
On Tue, Oct 16, 2018 at 5:56 PM Keith Packard <keithp@keithp.com> wrote:

> Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> writes:
>
> > You can make the monotonic case the same as the raw case if you make
> > sure to always sample the CPU first by e.g. splitting the loops into
> > two and doing CPU in the first and GPU in the second. That way you
> > make the case above impossible.
>
> Right, I had thought of that, and it probably solves the problem for
> us. If more time domains are added, things become 'more complicated'
> though.
>

Doing all of the CPU sampling on one side or the other of the GPU sampling
would probably reduce our window.


> > That said "start of the interval of the tick" is kinda arbitrary and
> > you could pick random other points on that interval, so depending on
> > what requirements you put on it (i.e. can the chosen position be
> > different per call, consistent but implicit or explicitly picked which
> > might be leaked through the interface) the max deviation changes. So
> > depending on interpretation this thing can be very moot ...
>
> It doesn't really matter what phase you use; the timer increments
> periodically, and what really matters is the time when that happens
> relative to other clocks changing.
>

Agreed.

Thinking about this a bit more, I think it helps to consider each clock to
be a real number that's changing continuously in time and what you actually
measure is floor(x / P(x)) where P(x) is the period of x in nanoseconds..
(or ceil; it doesn't matter so long as you're consistent.)  At any given
point, the clocks do have an exact value relative to each other.  When you
sample, you grab floor(M / P(M)), floor(G / P(G)), and floor(R / P(R)) all
in some interval of size I.  The delta between the real values sampled is
most I but the sampling takes a floor operation, so the actual value of any
given clock C may be as much as P(C) greater than what was sampled but it
cannot be lower (assuming the floor convention).  This leaves us with a
delta of I + max(P(M), P(R), P(G)).  In particular, any two real-number
valued times are, instantaneously, within that interval.

The next question becomes, if I sample again and assume zero clock drift,
what are the bounds on the next sampling.  Above, we calculated the maximum
delta between real-valued clocks.  However, because we're sampling again,
we may end up with more phase shift issues and any clock may, again, be off
by as much as P(C).  However, again assuming no drift, no clock is going to
be off with respect to itself; just sampled at a different phase so I think
the most delta you can see between two clocks in the two samplings is the
sum of their periods.  So if the delta we're looking for is a delta for a
theoretical second sampling, I think it's I plus the maximum of the sums of
all pairs of periods.

Personally, I'm completely content to have the delta just be a the first
one: a bound on the difference between any two real-valued times.  At this
point, I can guarantee you that far more thought has been put into this
mesa-dev discussion than was put into the spec and I think we're rapidly
getting to the point of diminishing returns. :-)

--Jason
<div dir="ltr"><div class="gmail_quote"><div dir="ltr">On Tue, Oct 16, 2018 at 5:56 PM Keith Packard &lt;<a href="mailto:keithp@keithp.com">keithp@keithp.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Bas Nieuwenhuizen &lt;<a href="mailto:bas@basnieuwenhuizen.nl" target="_blank">bas@basnieuwenhuizen.nl</a>&gt; writes:<br>
<br>
&gt; You can make the monotonic case the same as the raw case if you make<br>
&gt; sure to always sample the CPU first by e.g. splitting the loops into<br>
&gt; two and doing CPU in the first and GPU in the second. That way you<br>
&gt; make the case above impossible.<br>
<br>
Right, I had thought of that, and it probably solves the problem for<br>
us. If more time domains are added, things become &#39;more complicated&#39;<br>
though.<br></blockquote><div><br></div><div>Doing all of the CPU sampling on one side or the other of the GPU sampling would probably reduce our window.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
&gt; That said &quot;start of the interval of the tick&quot; is kinda arbitrary and<br>
&gt; you could pick random other points on that interval, so depending on<br>
&gt; what requirements you put on it (i.e. can the chosen position be<br>
&gt; different per call, consistent but implicit or explicitly picked which<br>
&gt; might be leaked through the interface) the max deviation changes. So<br>
&gt; depending on interpretation this thing can be very moot ...<br>
<br>
It doesn&#39;t really matter what phase you use; the timer increments<br>
periodically, and what really matters is the time when that happens<br>
relative to other clocks changing.<br></blockquote><div><br></div><div>Agreed.</div><div><br></div><div>Thinking about this a bit more, I think it helps to consider each clock to be a real number that&#39;s changing continuously in time and what you actually measure is floor(x / P(x)) where P(x) is the period of x in nanoseconds.. (or ceil; it doesn&#39;t matter so long as you&#39;re consistent.)  At any given point, the clocks do have an exact value relative to each other.  When you sample, you grab floor(M / P(M)), floor(G / P(G)), and floor(R / P(R)) all in some interval of size I.  The delta between the real values sampled is most I but the sampling takes a floor operation, so the actual value of any given clock C may be as much as P(C) greater than what was sampled but it cannot be lower (assuming the floor convention).  This leaves us with a delta of I + max(P(M), P(R), P(G)).  In particular, any two real-number valued times are, instantaneously, within that interval.</div><div><br></div><div>The next question becomes, if I sample again and assume zero clock drift, what are the bounds on the next sampling.  Above, we calculated the maximum delta between real-valued clocks.  However, because we&#39;re sampling again, we may end up with more phase shift issues and any clock may, again, be off by as much as P(C).  However, again assuming no drift, no clock is going to be off with respect to itself; just sampled at a different phase so I think the most delta you can see between two clocks in the two samplings is the sum of their periods.  So if the delta we&#39;re looking for is a delta for a theoretical second sampling, I think it&#39;s I plus the maximum of the sums of all pairs of periods.</div><div><br></div><div>Personally, I&#39;m completely content to have the delta just be a the first one: a bound on the difference between any two real-valued times.  At this point, I can guarantee you that far more thought has been put into this mesa-dev discussion than was put into the spec and I think we&#39;re rapidly getting to the point of diminishing returns. :-)<br></div><div><br></div><div>--Jason<br></div></div></div>
Keith Packard Oct. 17, 2018, 5:01 a.m. UTC | #14
Jason Ekstrand <jason@jlekstrand.net> writes:


> The result is that we're looking at something like "end - start +
> monotonic_raw_tick + max(gpu_tick, monotonic_tick)"  Have I just come
> full-circle?

Yes.  As monotonic_raw_tick and monotonic_tick are both 1...
Keith Packard Oct. 17, 2018, 5:14 a.m. UTC | #15
Jason Ekstrand <jason@jlekstrand.net> writes:

> Doing all of the CPU sampling on one side or the other of the GPU sampling
> would probably reduce our window.

True, although as I said, it's taking several µs to get through the
loop, and the gpu clock tick is far smaller than that, so even adding
the two values together to make it fit the current implementation won't
make the deviation that much larger.

> This leaves us with a delta of I + max(P(M), P(R), P(G)).  In
> particular, any two real-number valued times are, instantaneously,
> within that interval.

That, at least, would be easy to compute, and scale nicely if we added
more clocks in the future.

> Personally, I'm completely content to have the delta just be a the first
> one: a bound on the difference between any two real-valued times.  At this
> point, I can guarantee you that far more thought has been put into this
> mesa-dev discussion than was put into the spec and I think we're rapidly
> getting to the point of diminishing returns. :-)

It seems likely. How about we do the above computation for the current
code and leave it at that?
Jason Ekstrand Oct. 17, 2018, 3:34 p.m. UTC | #16
On Wed, Oct 17, 2018 at 12:14 AM Keith Packard <keithp@keithp.com> wrote:

> Jason Ekstrand <jason@jlekstrand.net> writes:
>
> > Doing all of the CPU sampling on one side or the other of the GPU
> sampling
> > would probably reduce our window.
>
> True, although as I said, it's taking several µs to get through the
> loop, and the gpu clock tick is far smaller than that, so even adding
> the two values together to make it fit the current implementation won't
> make the deviation that much larger.
>
> > This leaves us with a delta of I + max(P(M), P(R), P(G)).  In
> > particular, any two real-number valued times are, instantaneously,
> > within that interval.
>
> That, at least, would be easy to compute, and scale nicely if we added
> more clocks in the future.
>
> > Personally, I'm completely content to have the delta just be a the first
> > one: a bound on the difference between any two real-valued times.  At
> this
> > point, I can guarantee you that far more thought has been put into this
> > mesa-dev discussion than was put into the spec and I think we're rapidly
> > getting to the point of diminishing returns. :-)
>
> It seems likely. How about we do the above computation for the current
> code and leave it at that?
>

Sounds like a plan.  Note that I should be computed as I = end - start +
monotonic_raw_tick_ns to ensure we get a big enough interval.  Given that
monotonic_raw_tick_ns is likely 1, this doesn't expand things much.

I think a comment is likely also in order.  Probably not containing the
entire e-mail thread but maybe some of my reasoning above?

--Jason
<div dir="ltr"><div class="gmail_quote"><div dir="ltr">On Wed, Oct 17, 2018 at 12:14 AM Keith Packard &lt;<a href="mailto:keithp@keithp.com">keithp@keithp.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Jason Ekstrand &lt;<a href="mailto:jason@jlekstrand.net" target="_blank">jason@jlekstrand.net</a>&gt; writes:<br>
<br>
&gt; Doing all of the CPU sampling on one side or the other of the GPU sampling<br>
&gt; would probably reduce our window.<br>
<br>
True, although as I said, it&#39;s taking several µs to get through the<br>
loop, and the gpu clock tick is far smaller than that, so even adding<br>
the two values together to make it fit the current implementation won&#39;t<br>
make the deviation that much larger.<br>
<br>
&gt; This leaves us with a delta of I + max(P(M), P(R), P(G)).  In<br>
&gt; particular, any two real-number valued times are, instantaneously,<br>
&gt; within that interval.<br>
<br>
That, at least, would be easy to compute, and scale nicely if we added<br>
more clocks in the future.<br>
<br>
&gt; Personally, I&#39;m completely content to have the delta just be a the first<br>
&gt; one: a bound on the difference between any two real-valued times.  At this<br>
&gt; point, I can guarantee you that far more thought has been put into this<br>
&gt; mesa-dev discussion than was put into the spec and I think we&#39;re rapidly<br>
&gt; getting to the point of diminishing returns. :-)<br>
<br>
It seems likely. How about we do the above computation for the current<br>
code and leave it at that?<br></blockquote><div><br></div><div>Sounds like a plan.  Note that I should be computed as I = end - start + monotonic_raw_tick_ns to ensure we get a big enough interval.  Given that monotonic_raw_tick_ns is likely 1, this doesn&#39;t expand things much.</div><div><br></div><div>I think a comment is likely also in order.  Probably not containing the entire e-mail thread but maybe some of my reasoning above?</div><div><br></div><div>--Jason<br></div></div></div>
diff mbox series

Patch

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 174922780fc..80050485e54 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -4955,3 +4955,84 @@  radv_GetDeviceGroupPeerMemoryFeatures(
 	                       VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT |
 	                       VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT;
 }
+
+static const VkTimeDomainEXT radv_time_domains[] = {
+	VK_TIME_DOMAIN_DEVICE_EXT,
+	VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT,
+	VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT,
+};
+
+VkResult radv_GetPhysicalDeviceCalibrateableTimeDomainsEXT(
+	VkPhysicalDevice                             physicalDevice,
+	uint32_t                                     *pTimeDomainCount,
+	VkTimeDomainEXT                              *pTimeDomains)
+{
+	int d;
+	VK_OUTARRAY_MAKE(out, pTimeDomains, pTimeDomainCount);
+
+	for (d = 0; d < ARRAY_SIZE(radv_time_domains); d++) {
+		vk_outarray_append(&out, i) {
+			*i = radv_time_domains[d];
+		}
+	}
+
+	return vk_outarray_status(&out);
+}
+
+static uint64_t
+radv_clock_gettime(clockid_t clock_id)
+{
+	struct timespec current;
+	int ret;
+
+	ret = clock_gettime(clock_id, &current);
+	if (ret < 0 && clock_id == CLOCK_MONOTONIC_RAW)
+		ret = clock_gettime(CLOCK_MONOTONIC, &current);
+	if (ret < 0)
+		return 0;
+
+	return (uint64_t) current.tv_sec * 1000000000ULL + current.tv_nsec;
+}
+
+VkResult radv_GetCalibratedTimestampsEXT(
+	VkDevice                                     _device,
+	uint32_t                                     timestampCount,
+	const VkCalibratedTimestampInfoEXT           *pTimestampInfos,
+	uint64_t                                     *pTimestamps,
+	uint64_t                                     *pMaxDeviation)
+{
+	RADV_FROM_HANDLE(radv_device, device, _device);
+	uint32_t clock_crystal_freq = device->physical_device->rad_info.clock_crystal_freq;
+	int d;
+	uint64_t begin, end;
+
+	begin = radv_clock_gettime(CLOCK_MONOTONIC_RAW);
+
+	for (d = 0; d < timestampCount; d++) {
+		switch (pTimestampInfos[d].timeDomain) {
+		case VK_TIME_DOMAIN_DEVICE_EXT:
+			pTimestamps[d] = device->ws->query_value(device->ws,
+								 RADEON_TIMESTAMP);
+			break;
+		case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT:
+			pTimestamps[d] = radv_clock_gettime(CLOCK_MONOTONIC);
+			break;
+
+		case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT:
+			pTimestamps[d] = begin;
+			break;
+		default:
+			pTimestamps[d] = 0;
+			break;
+		}
+	}
+
+	end = radv_clock_gettime(CLOCK_MONOTONIC_RAW);
+
+	uint64_t clock_period = end - begin;
+	uint64_t device_period = DIV_ROUND_UP(1000000, clock_crystal_freq);
+
+	*pMaxDeviation = MAX2(clock_period, device_period);
+
+	return VK_SUCCESS;
+}
diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py
index 5dcedae1c63..4c81d3f0068 100644
--- a/src/amd/vulkan/radv_extensions.py
+++ b/src/amd/vulkan/radv_extensions.py
@@ -92,6 +92,7 @@  EXTENSIONS = [
     Extension('VK_KHR_display',                          23, 'VK_USE_PLATFORM_DISPLAY_KHR'),
     Extension('VK_EXT_direct_mode_display',               1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
     Extension('VK_EXT_acquire_xlib_display',              1, 'VK_USE_PLATFORM_XLIB_XRANDR_EXT'),
+    Extension('VK_EXT_calibrated_timestamps',             1, True),
     Extension('VK_EXT_conditional_rendering',             1, True),
     Extension('VK_EXT_conservative_rasterization',        1, 'device->rad_info.chip_class >= GFX9'),
     Extension('VK_EXT_display_surface_counter',           1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index a2551452eb1..249d7f36f0f 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -3021,6 +3021,95 @@  void anv_DestroyFramebuffer(
    vk_free2(&device->alloc, pAllocator, fb);
 }
 
+static const VkTimeDomainEXT anv_time_domains[] = {
+   VK_TIME_DOMAIN_DEVICE_EXT,
+   VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT,
+   VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT,
+};
+
+VkResult anv_GetPhysicalDeviceCalibrateableTimeDomainsEXT(
+   VkPhysicalDevice                             physicalDevice,
+   uint32_t                                     *pTimeDomainCount,
+   VkTimeDomainEXT                              *pTimeDomains)
+{
+   int d;
+   VK_OUTARRAY_MAKE(out, pTimeDomains, pTimeDomainCount);
+
+   for (d = 0; d < ARRAY_SIZE(anv_time_domains); d++) {
+      vk_outarray_append(&out, i) {
+         *i = anv_time_domains[d];
+      }
+   }
+
+   return vk_outarray_status(&out);
+}
+
+static uint64_t
+anv_clock_gettime(clockid_t clock_id)
+{
+   struct timespec current;
+   int ret;
+
+   ret = clock_gettime(clock_id, &current);
+   if (ret < 0 && clock_id == CLOCK_MONOTONIC_RAW)
+      ret = clock_gettime(CLOCK_MONOTONIC, &current);
+   if (ret < 0)
+      return 0;
+
+   return (uint64_t) current.tv_sec * 1000000000ULL + current.tv_nsec;
+}
+
+#define TIMESTAMP 0x2358
+
+VkResult anv_GetCalibratedTimestampsEXT(
+   VkDevice                                     _device,
+   uint32_t                                     timestampCount,
+   const VkCalibratedTimestampInfoEXT           *pTimestampInfos,
+   uint64_t                                     *pTimestamps,
+   uint64_t                                     *pMaxDeviation)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   uint64_t timestamp_frequency = device->info.timestamp_frequency;
+   int  ret;
+   int d;
+   uint64_t begin, end;
+
+   begin = anv_clock_gettime(CLOCK_MONOTONIC_RAW);
+
+   for (d = 0; d < timestampCount; d++) {
+      switch (pTimestampInfos[d].timeDomain) {
+      case VK_TIME_DOMAIN_DEVICE_EXT:
+         ret = anv_gem_reg_read(device, TIMESTAMP | 1,
+                                &pTimestamps[d]);
+
+         if (ret != 0) {
+            device->lost = TRUE;
+            return VK_ERROR_DEVICE_LOST;
+         }
+         break;
+      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT:
+         pTimestamps[d] = anv_clock_gettime(CLOCK_MONOTONIC);
+         break;
+
+      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT:
+         pTimestamps[d] = begin;
+         break;
+      default:
+         pTimestamps[d] = 0;
+         break;
+      }
+   }
+
+   end = anv_clock_gettime(CLOCK_MONOTONIC_RAW);
+
+   uint64_t clock_period = end - begin;
+   uint64_t device_period = DIV_ROUND_UP(1000000000, timestamp_frequency);
+
+   *pMaxDeviation = MAX2(clock_period, device_period);
+
+   return VK_SUCCESS;
+}
+
 /* vk_icd.h does not declare this function, so we declare it here to
  * suppress Wmissing-prototypes.
  */
diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py
index d4915c95013..a8535964da7 100644
--- a/src/intel/vulkan/anv_extensions.py
+++ b/src/intel/vulkan/anv_extensions.py
@@ -126,6 +126,7 @@  EXTENSIONS = [
     Extension('VK_EXT_vertex_attribute_divisor',          3, True),
     Extension('VK_EXT_post_depth_coverage',               1, 'device->info.gen >= 9'),
     Extension('VK_EXT_sampler_filter_minmax',             1, 'device->info.gen >= 9'),
+    Extension('VK_EXT_calibrated_timestamps',             1, True),
 ]
 
 class VkVersion:
diff --git a/src/intel/vulkan/anv_gem.c b/src/intel/vulkan/anv_gem.c
index c43b5ef9e06..1bdf040c1a3 100644
--- a/src/intel/vulkan/anv_gem.c
+++ b/src/intel/vulkan/anv_gem.c
@@ -423,6 +423,19 @@  anv_gem_fd_to_handle(struct anv_device *device, int fd)
    return args.handle;
 }
 
+int
+anv_gem_reg_read(struct anv_device *device, uint32_t offset, uint64_t *result)
+{
+   struct drm_i915_reg_read args = {
+      .offset = offset
+   };
+
+   int ret = anv_ioctl(device->fd, DRM_IOCTL_I915_REG_READ, &args);
+
+   *result = args.val;
+   return ret;
+}
+
 #ifndef SYNC_IOC_MAGIC
 /* duplicated from linux/sync_file.h to avoid build-time dependency
  * on new (v4.7) kernel headers.  Once distro's are mostly using
diff --git a/src/intel/vulkan/anv_gem_stubs.c b/src/intel/vulkan/anv_gem_stubs.c
index 5093bd5db1a..8cc3ad1f22e 100644
--- a/src/intel/vulkan/anv_gem_stubs.c
+++ b/src/intel/vulkan/anv_gem_stubs.c
@@ -251,3 +251,10 @@  anv_gem_syncobj_wait(struct anv_device *device,
 {
    unreachable("Unused");
 }
+
+int
+anv_gem_reg_read(struct anv_device *device,
+                 uint32_t offset, uint64_t *result)
+{
+   unreachable("Unused");
+}
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 599b903f25c..08376b00c8e 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1103,6 +1103,8 @@  int anv_gem_get_aperture(int fd, uint64_t *size);
 int anv_gem_gpu_get_reset_stats(struct anv_device *device,
                                 uint32_t *active, uint32_t *pending);
 int anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle);
+int anv_gem_reg_read(struct anv_device *device,
+                     uint32_t offset, uint64_t *result);
 uint32_t anv_gem_fd_to_handle(struct anv_device *device, int fd);
 int anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, uint32_t caching);
 int anv_gem_set_domain(struct anv_device *device, uint32_t gem_handle,