
[v2] xen/arm: flush icache as well when XEN_DOMCTL_cacheflush is issued

Message ID 20170127164545.6945-1-tamas.lengyel@zentific.com

Commit Message

Tamas Lengyel Jan. 27, 2017, 4:45 p.m. UTC
When the toolstack modifies memory of a running ARM VM it may happen
that the underlying memory of a current vCPU PC is changed. Without
flushing the icache the vCPU may continue executing stale instructions.

In this patch we introduce VA-based icache flushing macros and also
expose the xc_domain_cacheflush function through xenctrl.h.

Signed-off-by: Tamas K Lengyel <tamas.lengyel@zentific.com>
---
Cc: Ian Jackson <ian.jackson@eu.citrix.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Julien Grall <julien.grall@arm.com>

Note: patch has been verified to solve stale icache issues on the
      HiKey platform.

v2: Return 0 on x86 and clarify comment in xenctrl.h
---
 tools/libxc/include/xenctrl.h    |  8 ++++++++
 tools/libxc/xc_domain.c          |  6 +++---
 tools/libxc/xc_private.h         |  3 ---
 xen/arch/arm/mm.c                |  1 +
 xen/include/asm-arm/arm32/page.h |  3 +++
 xen/include/asm-arm/arm64/page.h |  3 +++
 xen/include/asm-arm/page.h       | 31 +++++++++++++++++++++++++++++++
 7 files changed, 49 insertions(+), 6 deletions(-)
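
For context, a hedged usage sketch of the call being exported (the helper
name, the gfn, and the single-page assumption are illustrative, not part of
the patch): a toolstack maps a guest page, modifies it, and then calls
xc_domain_cacheflush so a running vCPU cannot keep fetching stale
instructions.

#include <string.h>
#include <sys/mman.h>
#include <xenctrl.h>

/* Illustrative helper, not from the patch: overwrite (part of) one page
 * of a running guest, then force cache coherency. len must be at most
 * XC_PAGE_SIZE. */
static int patch_guest_page(xc_interface *xch, uint32_t domid,
                            xen_pfn_t gfn, const void *buf, size_t len)
{
    void *page = xc_map_foreign_range(xch, domid, XC_PAGE_SIZE,
                                      PROT_READ | PROT_WRITE, gfn);

    if ( !page )
        return -1;

    memcpy(page, buf, len);            /* modify the guest's memory */
    munmap(page, XC_PAGE_SIZE);

    /* On ARM this issues XEN_DOMCTL_cacheflush; on x86 it is a no-op
     * that (after this patch) simply returns 0. */
    return xc_domain_cacheflush(xch, domid, gfn, 1);
}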

Comments

Wei Liu Jan. 27, 2017, 5:09 p.m. UTC | #1
On Fri, Jan 27, 2017 at 09:45:45AM -0700, Tamas K Lengyel wrote:
> When the toolstack modifies memory of a running ARM VM it may happen
> that the underlying memory of a current vCPU PC is changed. Without
> flushing the icache the vCPU may continue executing stale instructions.
> 
> In this patch we introduce VA-based icache flushing macros. Also expose
> the xc_domain_cacheflush through xenctrl.h.
> 
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@zentific.com>
> ---
> Cc: Ian Jackson <ian.jackson@eu.citrix.com>
> Cc: Wei Liu <wei.liu2@citrix.com>
> Cc: Stefano Stabellini <sstabellini@kernel.org>
> Cc: Julien Grall <julien.grall@arm.com>
> 
> Note: patch has been verified to solve stale icache issues on the
>       HiKey platform.
> 
> v2: Return 0 on x86 and clarify comment in xenctrl.h
> ---
>  tools/libxc/include/xenctrl.h    |  8 ++++++++
>  tools/libxc/xc_domain.c          |  6 +++---
>  tools/libxc/xc_private.h         |  3 ---

Acked-by: Wei Liu <wei.liu2@citrix.com>
Julien Grall Jan. 27, 2017, 5:25 p.m. UTC | #2
Hello Tamas,

Please give people a bit more time to review before sending a new
version. Patches adding cache instructions are never easy to review.

On 27/01/17 16:45, Tamas K Lengyel wrote:
> When the toolstack modifies memory of a running ARM VM it may happen
> that the underlying memory of a current vCPU PC is changed. Without
> flushing the icache the vCPU may continue executing stale instructions.
> In this patch we introduce VA-based icache flushing macros. Also expose
> the xc_domain_cacheflush through xenctrl.h.
>
> Signed-off-by: Tamas K Lengyel <tamas.lengyel@zentific.com>
> ---
> Cc: Ian Jackson <ian.jackson@eu.citrix.com>
> Cc: Wei Liu <wei.liu2@citrix.com>
> Cc: Stefano Stabellini <sstabellini@kernel.org>
> Cc: Julien Grall <julien.grall@arm.com>
>
> Note: patch has been verified to solve stale icache issues on the
>       HiKey platform.

In the future, please include a reference to the ARM ARM to corroborate
your testing. This would speed up the review.

>
> v2: Return 0 on x86 and clarify comment in xenctrl.h
> ---
>  tools/libxc/include/xenctrl.h    |  8 ++++++++
>  tools/libxc/xc_domain.c          |  6 +++---
>  tools/libxc/xc_private.h         |  3 ---
>  xen/arch/arm/mm.c                |  1 +
>  xen/include/asm-arm/arm32/page.h |  3 +++
>  xen/include/asm-arm/arm64/page.h |  3 +++
>  xen/include/asm-arm/page.h       | 31 +++++++++++++++++++++++++++++++
>  7 files changed, 49 insertions(+), 6 deletions(-)
>
> diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
> index 63c616ff6a..a2f23fcd5a 100644
> --- a/tools/libxc/include/xenctrl.h
> +++ b/tools/libxc/include/xenctrl.h
> @@ -2720,6 +2720,14 @@ int xc_livepatch_revert(xc_interface *xch, char *name, uint32_t timeout);
>  int xc_livepatch_unload(xc_interface *xch, char *name, uint32_t timeout);
>  int xc_livepatch_replace(xc_interface *xch, char *name, uint32_t timeout);
>
> +/*
> + * Ensure cache coherency after memory modifications. A call to this function
> + * is only required on ARM as the x86 architecture provides cache coherency
> + * guarantees. Calling this function on x86 is allowed but has no effect.
> + */
> +int xc_domain_cacheflush(xc_interface *xch, uint32_t domid,
> +                         xen_pfn_t start_pfn, xen_pfn_t nr_pfns);
> +
>  /* Compat shims */
>  #include "xenctrl_compat.h"
>
> diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
> index 296b8523b5..98ab6ba3fd 100644
> --- a/tools/libxc/xc_domain.c
> +++ b/tools/libxc/xc_domain.c
> @@ -74,10 +74,10 @@ int xc_domain_cacheflush(xc_interface *xch, uint32_t domid,
>      /*
>       * The x86 architecture provides cache coherency guarantees which prevent
>       * the need for this hypercall.  Avoid the overhead of making a hypercall
> -     * just for Xen to return -ENOSYS.
> +     * just for Xen to return -ENOSYS.  It is safe to ignore this call on x86
> +     * so we just return 0.
>       */
> -    errno = ENOSYS;
> -    return -1;
> +    return 0;
>  #else
>      DECLARE_DOMCTL;
>      domctl.cmd = XEN_DOMCTL_cacheflush;
> diff --git a/tools/libxc/xc_private.h b/tools/libxc/xc_private.h
> index 97445ae1fe..fddebdc917 100644
> --- a/tools/libxc/xc_private.h
> +++ b/tools/libxc/xc_private.h
> @@ -366,9 +366,6 @@ void bitmap_byte_to_64(uint64_t *lp, const uint8_t *bp, int nbits);
>  /* Optionally flush file to disk and discard page cache */
>  void discard_file_cache(xc_interface *xch, int fd, int flush);
>
> -int xc_domain_cacheflush(xc_interface *xch, uint32_t domid,
> -			 xen_pfn_t start_pfn, xen_pfn_t nr_pfns);
> -
>  #define MAX_MMU_UPDATES 1024
>  struct xc_mmu {

>      mmu_update_t updates[MAX_MMU_UPDATES];
> diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
> index 99588a330d..43e5b3d9e2 100644
> --- a/xen/arch/arm/mm.c
> +++ b/xen/arch/arm/mm.c
> @@ -389,6 +389,7 @@ void flush_page_to_ram(unsigned long mfn)
>      void *v = map_domain_page(_mfn(mfn));
>
>      clean_and_invalidate_dcache_va_range(v, PAGE_SIZE);
> +    invalidate_icache_va_range(v, PAGE_SIZE);

I was about to say that the instruction cache flush would not be
necessary for the current use case. But in fact, even if the domain is
not yet running or we are merely allocating a page, we still need to
flush the I-Cache.

However, I am afraid that invalidating the I-Cache by VA range will not
work as you expect on all platforms. This depends heavily on the
behavior of the I-Cache (see D4.9.2 in ARM DDI 0487A.k_iss10775).

For some instruction cache implementations (such as VIPT), you would
need to flush the entire I-Cache to guarantee that all aliases of a
given physical address are removed from the cache.

There are two options to fix the issue:
	1) Always flush the entire I-cache
	2) Flush either by VA or the entire I-cache, depending on the I-cache
implementation

I don't mind if you implement option 1 for now; a minimal sketch follows.
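
For illustration, a minimal arm64 sketch of option 1 (the helper name
invalidate_icache() is ours, not from this patch): "ic ialluis"
invalidates all instruction caches in the Inner Shareable domain to the
PoU, so no per-VA iteration is needed and VIPT aliasing is not a concern.

static inline void invalidate_icache(void)
{
    asm volatile ("ic ialluis");  /* invalidate all I-caches, Inner Shareable, to PoU */
    dsb(ish);                     /* wait for the invalidation to complete */
    isb();                        /* resynchronize the local instruction stream */
}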

>      unmap_domain_page(v);
>  }
>
> diff --git a/xen/include/asm-arm/arm32/page.h b/xen/include/asm-arm/arm32/page.h
> index ea4b312c70..10e5288d0f 100644
> --- a/xen/include/asm-arm/arm32/page.h
> +++ b/xen/include/asm-arm/arm32/page.h
> @@ -19,6 +19,9 @@ static inline void write_pte(lpae_t *p, lpae_t pte)
>          : : "r" (pte.bits), "r" (p) : "memory");
>  }
>
> +/* Inline ASM to invalidate icache on register R (may be an inline asm operand) */
> +#define __invalidate_icache_one(R) STORE_CP32(R, ICIMVAU)

For ARM32, you would also need to invalidate the branch predictor.
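
For example, a hedged ARM32 sketch (the helper name is ours, and whether
this tree's cpregs.h names BPIMVA is an assumption, hence the raw CP15
encodings): invalidate the I-cache line by VA together with the branch
predictor entry for the same VA.

static inline void __invalidate_icache_and_bp_one(const void *va)
{
    asm volatile ("mcr p15, 0, %0, c7, c5, 1\n\t"  /* ICIMVAU: I-cache invalidate by MVA to PoU */
                  "mcr p15, 0, %0, c7, c5, 7\n\t"  /* BPIMVA: branch predictor invalidate by MVA */
                  : : "r" (va));
}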

> +
>  /* Inline ASM to invalidate dcache on register R (may be an inline asm operand) */
>  #define __invalidate_dcache_one(R) STORE_CP32(R, DCIMVAC)
>
> diff --git a/xen/include/asm-arm/arm64/page.h b/xen/include/asm-arm/arm64/page.h
> index 23d778154d..0f380b95b4 100644
> --- a/xen/include/asm-arm/arm64/page.h
> +++ b/xen/include/asm-arm/arm64/page.h
> @@ -16,6 +16,9 @@ static inline void write_pte(lpae_t *p, lpae_t pte)
>          : : "r" (pte.bits), "r" (p) : "memory");
>  }
>
> +/* Inline ASM to invalidate icache on register R (may be an inline asm operand) */
> +#define __invalidate_icache_one(R) "ic ivau, %" #R ";"
> +
>  /* Inline ASM to invalidate dcache on register R (may be an inline asm operand) */
>  #define __invalidate_dcache_one(R) "dc ivac, %" #R ";"
>
> diff --git a/xen/include/asm-arm/page.h b/xen/include/asm-arm/page.h
> index c492d6df50..a618d0e556 100644
> --- a/xen/include/asm-arm/page.h
> +++ b/xen/include/asm-arm/page.h
> @@ -371,6 +371,37 @@ static inline int clean_and_invalidate_dcache_va_range
>              : : "r" (_p), "m" (*_p));                                   \
>  } while (0)
>
> +static inline int invalidate_icache_va_range(const void *p, unsigned long size)
> +{
> +    size_t off;
> +    const void *end = p + size;
> +
> +    dsb(sy);           /* So the CPU issues all writes to the range */

The invalidation will happen on the inner-shareable domain, so
dsb(ish) is enough here.

> +
> +    off = (unsigned long)p % cacheline_bytes;

cacheline_bytes contains the cacheline size of the data cache, not the
instruction cache.
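
A hedged arm64 sketch of deriving the correct value instead (the helper
name is illustrative): CTR_EL0.IminLine, bits [3:0], holds log2 of the
number of 4-byte words in the smallest I-cache line.

static inline unsigned long icache_line_bytes(void)
{
    unsigned long ctr;

    asm volatile ("mrs %0, ctr_el0" : "=r" (ctr));
    return 4UL << (ctr & 0xf);    /* IminLine: log2(words) of the smallest I-cache line */
}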

> +    if ( off )
> +    {
> +        p -= off;
> +        asm volatile (__invalidate_icache_one(0) : : "r" (p));
> +        p += cacheline_bytes;
> +        size -= cacheline_bytes - off;
> +    }
> +    off = (unsigned long)end % cacheline_bytes;
> +    if ( off )
> +    {
> +        end -= off;
> +        size -= off;
> +        asm volatile (__invalidate_icache_one(0) : : "r" (end));
> +    }
> +
> +    for ( ; p < end; p += cacheline_bytes )
> +        asm volatile (__invalidate_icache_one(0) : : "r" (p));
> +
> +    dsb(sy);           /* So we know the flushes happen before continuing */

dsb(ish) here as well; a sketch with both barriers adjusted is included
at the end of this mail.

> +
> +    return 0;
> +}
> +
>  /*
>   * Flush a range of VA's hypervisor mappings from the data TLB of the
>   * local processor. This is not sufficient when changing code mappings
>
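
Putting the two barrier comments together, a hedged sketch of the function
with dsb(ish) applied (the simplified alignment is ours: rounding the start
down and looping to the unaligned end already covers partial head and tail
lines; the cacheline_bytes concern above is left open):

static inline int invalidate_icache_va_range(const void *p, unsigned long size)
{
    const void *end = p + size;

    dsb(ish);           /* So the CPU issues all writes to the range */

    /* Align down; the loop then covers any partial first and last line. */
    p = (const void *)((unsigned long)p & ~(cacheline_bytes - 1));
    for ( ; p < end; p += cacheline_bytes )
        asm volatile (__invalidate_icache_one(0) : : "r" (p));

    dsb(ish);           /* So we know the flushes happen before continuing */

    return 0;
}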

Regards,
Tamas Lengyel Jan. 27, 2017, 6:04 p.m. UTC | #3
On Fri, Jan 27, 2017 at 10:25 AM, Julien Grall <julien.grall@arm.com> wrote:
> Hello Tamas,
>
> Please give people a bit more time to review before sending a new
> version. Patches adding cache instructions are never easy to review.
>
> On 27/01/17 16:45, Tamas K Lengyel wrote:
>>
>> When the toolstack modifies memory of a running ARM VM it may happen
>> that the underlying memory of a current vCPU PC is changed. Without
>> flushing the icache the vCPU may continue executing stale instructions.
>> In this patch we introduce VA-based icache flushing macros. Also expose
>> the xc_domain_cacheflush through xenctrl.h.
>>
>> Signed-off-by: Tamas K Lengyel <tamas.lengyel@zentific.com>
>> ---
>> Cc: Ian Jackson <ian.jackson@eu.citrix.com>
>> Cc: Wei Liu <wei.liu2@citrix.com>
>> Cc: Stefano Stabellini <sstabellini@kernel.org>
>> Cc: Julien Grall <julien.grall@arm.com>
>>
>> Note: patch has been verified to solve stale icache issues on the
>>       HiKey platform.
>
>
> In the future, please include a reference to the ARM ARM to corroborate your
> testing. This would speed up the review.

Ack.

>
>
>>
>> v2: Return 0 on x86 and clarify comment in xenctrl.h
>> ---
>>  tools/libxc/include/xenctrl.h    |  8 ++++++++
>>  tools/libxc/xc_domain.c          |  6 +++---
>>  tools/libxc/xc_private.h         |  3 ---
>>  xen/arch/arm/mm.c                |  1 +
>>  xen/include/asm-arm/arm32/page.h |  3 +++
>>  xen/include/asm-arm/arm64/page.h |  3 +++
>>  xen/include/asm-arm/page.h       | 31 +++++++++++++++++++++++++++++++
>>  7 files changed, 49 insertions(+), 6 deletions(-)
>>
>> diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
>> index 63c616ff6a..a2f23fcd5a 100644
>> --- a/tools/libxc/include/xenctrl.h
>> +++ b/tools/libxc/include/xenctrl.h
>> @@ -2720,6 +2720,14 @@ int xc_livepatch_revert(xc_interface *xch, char
>> *name, uint32_t timeout);
>>  int xc_livepatch_unload(xc_interface *xch, char *name, uint32_t timeout);
>>  int xc_livepatch_replace(xc_interface *xch, char *name, uint32_t
>> timeout);
>>
>> +/*
>> + * Ensure cache coherency after memory modifications. A call to this
>> function
>> + * is only required on ARM as the x86 architecture provides cache
>> coherency
>> + * guarantees. Calling this function on x86 is allowed but has no effect.
>> + */
>> +int xc_domain_cacheflush(xc_interface *xch, uint32_t domid,
>> +                         xen_pfn_t start_pfn, xen_pfn_t nr_pfns);
>> +
>>  /* Compat shims */
>>  #include "xenctrl_compat.h"
>>
>> diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
>> index 296b8523b5..98ab6ba3fd 100644
>> --- a/tools/libxc/xc_domain.c
>> +++ b/tools/libxc/xc_domain.c
>> @@ -74,10 +74,10 @@ int xc_domain_cacheflush(xc_interface *xch, uint32_t
>> domid,
>>      /*
>>       * The x86 architecture provides cache coherency guarantees which
>> prevent
>>       * the need for this hypercall.  Avoid the overhead of making a
>> hypercall
>> -     * just for Xen to return -ENOSYS.
>> +     * just for Xen to return -ENOSYS.  It is safe to ignore this call on
>> x86
>> +     * so we just return 0.
>>       */
>> -    errno = ENOSYS;
>> -    return -1;
>> +    return 0;
>>  #else
>>      DECLARE_DOMCTL;
>>      domctl.cmd = XEN_DOMCTL_cacheflush;
>> diff --git a/tools/libxc/xc_private.h b/tools/libxc/xc_private.h
>> index 97445ae1fe..fddebdc917 100644
>> --- a/tools/libxc/xc_private.h
>> +++ b/tools/libxc/xc_private.h
>> @@ -366,9 +366,6 @@ void bitmap_byte_to_64(uint64_t *lp, const uint8_t
>> *bp, int nbits);
>>  /* Optionally flush file to disk and discard page cache */
>>  void discard_file_cache(xc_interface *xch, int fd, int flush);
>>
>> -int xc_domain_cacheflush(xc_interface *xch, uint32_t domid,
>> -                        xen_pfn_t start_pfn, xen_pfn_t nr_pfns);
>> -
>>  #define MAX_MMU_UPDATES 1024
>>  struct xc_mmu {
>
>
>>      mmu_update_t updates[MAX_MMU_UPDATES];
>> diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
>> index 99588a330d..43e5b3d9e2 100644
>> --- a/xen/arch/arm/mm.c
>> +++ b/xen/arch/arm/mm.c
>> @@ -389,6 +389,7 @@ void flush_page_to_ram(unsigned long mfn)
>>      void *v = map_domain_page(_mfn(mfn));
>>
>>      clean_and_invalidate_dcache_va_range(v, PAGE_SIZE);
>> +    invalidate_icache_va_range(v, PAGE_SIZE);
>
>
> I was about to say that the instruction cache flush would not be necessary
> for the current use case. But in fact, even if the domain is not yet running
> or we are merely allocating a page, we still need to flush the I-Cache.
>
> However, I am afraid that invalidating the I-Cache by VA range will not work
> as you expect on all platforms. This depends heavily on the behavior of
> the I-Cache (see D4.9.2 in ARM DDI 0487A.k_iss10775).
>
> For some instruction cache implementations (such as VIPT), you would need to
> flush the entire I-Cache to guarantee that all aliases of a given physical
> address are removed from the cache.
>
> There are two options to fix the issue:
>         1) Always flush the entire I-cache
>         2) Flush either by VA or the entire I-cache, depending on the I-cache
> implementation
>
> I don't mind if you implement option 1 for now.

That is certainly fine by me. Thanks for looking into this!

Tamas

Patch

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 63c616ff6a..a2f23fcd5a 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2720,6 +2720,14 @@  int xc_livepatch_revert(xc_interface *xch, char *name, uint32_t timeout);
 int xc_livepatch_unload(xc_interface *xch, char *name, uint32_t timeout);
 int xc_livepatch_replace(xc_interface *xch, char *name, uint32_t timeout);
 
+/*
+ * Ensure cache coherency after memory modifications. A call to this function
+ * is only required on ARM as the x86 architecture provides cache coherency
+ * guarantees. Calling this function on x86 is allowed but has no effect.
+ */
+int xc_domain_cacheflush(xc_interface *xch, uint32_t domid,
+                         xen_pfn_t start_pfn, xen_pfn_t nr_pfns);
+
 /* Compat shims */
 #include "xenctrl_compat.h"
 
diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index 296b8523b5..98ab6ba3fd 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -74,10 +74,10 @@  int xc_domain_cacheflush(xc_interface *xch, uint32_t domid,
     /*
      * The x86 architecture provides cache coherency guarantees which prevent
      * the need for this hypercall.  Avoid the overhead of making a hypercall
-     * just for Xen to return -ENOSYS.
+     * just for Xen to return -ENOSYS.  It is safe to ignore this call on x86
+     * so we just return 0.
      */
-    errno = ENOSYS;
-    return -1;
+    return 0;
 #else
     DECLARE_DOMCTL;
     domctl.cmd = XEN_DOMCTL_cacheflush;
diff --git a/tools/libxc/xc_private.h b/tools/libxc/xc_private.h
index 97445ae1fe..fddebdc917 100644
--- a/tools/libxc/xc_private.h
+++ b/tools/libxc/xc_private.h
@@ -366,9 +366,6 @@  void bitmap_byte_to_64(uint64_t *lp, const uint8_t *bp, int nbits);
 /* Optionally flush file to disk and discard page cache */
 void discard_file_cache(xc_interface *xch, int fd, int flush);
 
-int xc_domain_cacheflush(xc_interface *xch, uint32_t domid,
-			 xen_pfn_t start_pfn, xen_pfn_t nr_pfns);
-
 #define MAX_MMU_UPDATES 1024
 struct xc_mmu {
     mmu_update_t updates[MAX_MMU_UPDATES];
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index 99588a330d..43e5b3d9e2 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -389,6 +389,7 @@  void flush_page_to_ram(unsigned long mfn)
     void *v = map_domain_page(_mfn(mfn));
 
     clean_and_invalidate_dcache_va_range(v, PAGE_SIZE);
+    invalidate_icache_va_range(v, PAGE_SIZE);
     unmap_domain_page(v);
 }
 
diff --git a/xen/include/asm-arm/arm32/page.h b/xen/include/asm-arm/arm32/page.h
index ea4b312c70..10e5288d0f 100644
--- a/xen/include/asm-arm/arm32/page.h
+++ b/xen/include/asm-arm/arm32/page.h
@@ -19,6 +19,9 @@  static inline void write_pte(lpae_t *p, lpae_t pte)
         : : "r" (pte.bits), "r" (p) : "memory");
 }
 
+/* Inline ASM to invalidate icache on register R (may be an inline asm operand) */
+#define __invalidate_icache_one(R) STORE_CP32(R, ICIMVAU)
+
 /* Inline ASM to invalidate dcache on register R (may be an inline asm operand) */
 #define __invalidate_dcache_one(R) STORE_CP32(R, DCIMVAC)
 
diff --git a/xen/include/asm-arm/arm64/page.h b/xen/include/asm-arm/arm64/page.h
index 23d778154d..0f380b95b4 100644
--- a/xen/include/asm-arm/arm64/page.h
+++ b/xen/include/asm-arm/arm64/page.h
@@ -16,6 +16,9 @@  static inline void write_pte(lpae_t *p, lpae_t pte)
         : : "r" (pte.bits), "r" (p) : "memory");
 }
 
+/* Inline ASM to invalidate icache on register R (may be an inline asm operand) */
+#define __invalidate_icache_one(R) "ic ivau, %" #R ";"
+
 /* Inline ASM to invalidate dcache on register R (may be an inline asm operand) */
 #define __invalidate_dcache_one(R) "dc ivac, %" #R ";"
 
diff --git a/xen/include/asm-arm/page.h b/xen/include/asm-arm/page.h
index c492d6df50..a618d0e556 100644
--- a/xen/include/asm-arm/page.h
+++ b/xen/include/asm-arm/page.h
@@ -371,6 +371,37 @@  static inline int clean_and_invalidate_dcache_va_range
             : : "r" (_p), "m" (*_p));                                   \
 } while (0)
 
+static inline int invalidate_icache_va_range(const void *p, unsigned long size)
+{
+    size_t off;
+    const void *end = p + size;
+
+    dsb(sy);           /* So the CPU issues all writes to the range */
+
+    off = (unsigned long)p % cacheline_bytes;
+    if ( off )
+    {
+        p -= off;
+        asm volatile (__invalidate_icache_one(0) : : "r" (p));
+        p += cacheline_bytes;
+        size -= cacheline_bytes - off;
+    }
+    off = (unsigned long)end % cacheline_bytes;
+    if ( off )
+    {
+        end -= off;
+        size -= off;
+        asm volatile (__invalidate_icache_one(0) : : "r" (end));
+    }
+
+    for ( ; p < end; p += cacheline_bytes )
+        asm volatile (__invalidate_icache_one(0) : : "r" (p));
+
+    dsb(sy);           /* So we know the flushes happen before continuing */
+
+    return 0;
+}
+
 /*
  * Flush a range of VA's hypervisor mappings from the data TLB of the
  * local processor. This is not sufficient when changing code mappings