Patchworkβ sh: sh4a: Cache optimization if no cache alias

login
register
about
Submitter Valentin R Sitsikov
Date 2009-10-14 12:51:52
Message ID <4AD5C968.1030005@siemens.com>
Download mbox | patch
Permalink /patch/53661/
State Under Review
Headers show

Comments

Valentin R Sitsikov - 2009-10-14 12:51:52
Signed-off-by: Valentin Sitdikov <valentin.sitdikov@siemens.com>
---
 arch/sh/include/asm/system_32.h |    2 +-
 arch/sh/mm/Makefile             |    1 +
 arch/sh/mm/cache-sh4a.c         |  169 
+++++++++++++++++++++++++++++++++++++++
 arch/sh/mm/cache.c              |    6 ++
 4 files changed, 177 insertions(+), 1 deletions(-)
 create mode 100644 arch/sh/mm/cache-sh4a.c
Valentin R Sitsikov - 2009-10-28 08:11:11
Hello Paul!
If you don't mind, could you please comment on this patch?

Best regards,
Valentin
Valentin R Sitsikov wrote:
> Signed-off-by: Valentin Sitdikov <valentin.sitdikov@siemens.com>
> ---
> arch/sh/include/asm/system_32.h |    2 +-
> arch/sh/mm/Makefile             |    1 +
> arch/sh/mm/cache-sh4a.c         |  169
> +++++++++++++++++++++++++++++++++++++++
> arch/sh/mm/cache.c              |    6 ++
> 4 files changed, 177 insertions(+), 1 deletions(-)
> create mode 100644 arch/sh/mm/cache-sh4a.c
> 
> diff --git a/arch/sh/include/asm/system_32.h
> b/arch/sh/include/asm/system_32.h
> index 607d413..7fe8011 100644
> --- a/arch/sh/include/asm/system_32.h
> +++ b/arch/sh/include/asm/system_32.h
> @@ -72,7 +72,7 @@ do {                                    \
> #define __ocbp(addr)    __asm__ __volatile__ ( "ocbp @%0\n\t" : : "r"
> (addr))
> #define __ocbi(addr)    __asm__ __volatile__ ( "ocbi @%0\n\t" : : "r"
> (addr))
> #define __ocbwb(addr)    __asm__ __volatile__ ( "ocbwb @%0\n\t" : : "r"
> (addr))
> -
> +#define __icbi(addr)    __asm__ __volatile__ ( "icbi @%0\n\t" : : "r"
> (addr))
> struct task_struct *__switch_to(struct task_struct *prev,
>                 struct task_struct *next);
> 
> diff --git a/arch/sh/mm/Makefile b/arch/sh/mm/Makefile
> index b70024d..3a2de1d 100644
> --- a/arch/sh/mm/Makefile
> +++ b/arch/sh/mm/Makefile
> @@ -10,6 +10,7 @@ cacheops-$(CONFIG_CPU_SH3)        := cache-sh3.o
> cacheops-$(CONFIG_CPU_SH4)        := cache-sh4.o flush-sh4.o
> cacheops-$(CONFIG_CPU_SH5)        := cache-sh5.o flush-sh4.o
> cacheops-$(CONFIG_SH7705_CACHE_32KB)    += cache-sh7705.o
> +cacheops-$(CONFIG_CPU_SH4A)        += cache-sh4a.o
> 
> obj-y            += $(cacheops-y)
> 
> diff --git a/arch/sh/mm/cache-sh4a.c b/arch/sh/mm/cache-sh4a.c
> new file mode 100644
> index 0000000..147f0e3
> --- /dev/null
> +++ b/arch/sh/mm/cache-sh4a.c
> @@ -0,0 +1,169 @@
> +/*
> + * arch/sh/mm/cache-sh4a.c
> + *
> + * Copyright (C) 1999, 2000, 2002  Niibe Yutaka
> + * Copyright (C) 2001 - 2009  Paul Mundt
> + * Copyright (C) 2003  Richard Curnow
> + * Copyright (c) 2007 STMicroelectronics (R&D) Ltd.
> + * Copyright (c) 2009 Valentin Sitdikov
> + *
> + * This file is subject to the terms and conditions of the GNU General
> Public
> + * License.  See the file "COPYING" in the main directory of this archive
> + * for more details.
> + */
> +#include <linux/init.h>
> +#include <linux/mm.h>
> +#include <linux/io.h>
> +#include <linux/mutex.h>
> +#include <linux/fs.h>
> +#include <linux/highmem.h>
> +#include <linux/pagemap.h>
> +#include <asm/pgtable.h>
> +#include <asm/mmu_context.h>
> +#include <asm/cacheflush.h>
> +
> +/*
> + * The maximum number of pages we support up to when doing ranged dcache
> + * flushing. Anything exceeding this will simply flush the dcache in its
> + * entirety.
> + */
> +#define MAX_ICACHE_PAGES    32
> +
> +
> +static void sh4a_invalidate_icache(void *start, int size)
> +{
> +    reg_size_t aligned_start, v, cnt, end;
> +
> +    aligned_start = register_align(start);
> +    v = aligned_start & ~(L1_CACHE_BYTES-1);
> +    end = (aligned_start + size + L1_CACHE_BYTES-1)
> +        & ~(L1_CACHE_BYTES-1);
> +    cnt = (end - v) / L1_CACHE_BYTES;
> +
> +    while (cnt >= 8) {
> +        __icbi(v); v += L1_CACHE_BYTES;
> +        __icbi(v); v += L1_CACHE_BYTES;
> +        __icbi(v); v += L1_CACHE_BYTES;
> +        __icbi(v); v += L1_CACHE_BYTES;
> +        __icbi(v); v += L1_CACHE_BYTES;
> +        __icbi(v); v += L1_CACHE_BYTES;
> +        __icbi(v); v += L1_CACHE_BYTES;
> +        __icbi(v); v += L1_CACHE_BYTES;
> +        cnt -= 8;
> +    }
> +
> +    while (cnt) {
> +        __icbi(v); v += L1_CACHE_BYTES;
> +        cnt--;
> +    }
> +}
> +
> +/*
> + * Write back the dirty D-caches and invalidate them.
> + *
> + * START: Virtual Address (U0, P1, or P3)
> + * SIZE: Size of the region.
> + */
> +static void sh4a_purge_dcache(void *start, int size)
> +{
> +    reg_size_t aligned_start, v, cnt, end;
> +
> +    aligned_start = register_align(start);
> +    v = aligned_start & ~(L1_CACHE_BYTES-1);
> +    end = (aligned_start + size + L1_CACHE_BYTES-1)
> +        & ~(L1_CACHE_BYTES-1);
> +    cnt = (end - v) / L1_CACHE_BYTES;
> +
> +    while (cnt >= 8) {
> +        __ocbp(v); v += L1_CACHE_BYTES;
> +        __ocbp(v); v += L1_CACHE_BYTES;
> +        __ocbp(v); v += L1_CACHE_BYTES;
> +        __ocbp(v); v += L1_CACHE_BYTES;
> +        __ocbp(v); v += L1_CACHE_BYTES;
> +        __ocbp(v); v += L1_CACHE_BYTES;
> +        __ocbp(v); v += L1_CACHE_BYTES;
> +        __ocbp(v); v += L1_CACHE_BYTES;
> +        cnt -= 8;
> +    }
> +    while (cnt) {
> +        __ocbp(v); v += L1_CACHE_BYTES;
> +        cnt--;
> +    }
> +}
> +
> +/*
> + * Write back the range of D-cache, and purge the I-cache.
> + *
> + * Called from kernel/module.c:sys_init_module and routine for a.out
> format,
> + * signal handler code and kprobes code
> + */
> +static void __uses_jump_to_uncached sh4a_flush_icache_range(void *args)
> +{
> +    struct flusher_data *data = args;
> +    unsigned long start, end;
> +    unsigned long flags, v;
> +
> +    start = data->addr1;
> +    end = data->addr2;
> +
> +    /* If there are too many pages then just blow away the caches */
> +    if (((end - start) >> PAGE_SHIFT) >= MAX_ICACHE_PAGES) {
> +        local_flush_cache_all(NULL);
> +        return;
> +    }
> +
> +    /*
> +     * Selectively flush d-cache then invalidate the i-cache.
> +     * This is inefficient, so only use this for small ranges.
> +     */
> +    start &= ~(L1_CACHE_BYTES-1);
> +    end += L1_CACHE_BYTES-1;
> +    end &= ~(L1_CACHE_BYTES-1);
> +
> +    local_irq_save(flags);
> +    jump_to_uncached();
> +
> +    for (v = start; v < end; v += L1_CACHE_BYTES) {
> +        __ocbwb(v);
> +        __icbi(v);
> +    }
> +
> +    back_to_cached();
> +    local_irq_restore(flags);
> +}
> +
> +/*
> + * Write back & invalidate the D-cache of the page.
> + * (To avoid "alias" issues)
> + */
> +static void sh4a_flush_dcache_page(void *arg)
> +{
> +    struct page *page = arg;
> +    struct address_space *mapping = page_mapping(page);
> +
> +#ifndef CONFIG_SMP
> +    if (mapping && !mapping_mapped(mapping))
> +        set_bit(PG_dcache_dirty, &page->flags);
> +    else
> +#endif
> +    {
> +        sh4a_purge_dcache(page_address(page), PAGE_SIZE);
> +        sh4a_invalidate_icache(page_address(page), PAGE_SIZE);
> +    }
> +}
> +
> +
> +/*
> + * SH-4 has virtually indexed and physically tagged cache.
> + */
> +void __init sh4a_cache_init(void)
> +{
> +    printk("SH4A cache optimization\n");
> +
> +    local_flush_icache_range    = sh4a_flush_icache_range;
> +    /* Not sure about alias cases - not checked yet */
> +    if (boot_cpu_data.dcache.n_aliases == 0) {
> +        local_flush_dcache_page        = sh4a_flush_dcache_page;
> +    }
> +
> +}
> diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c
> index 4aa9260..72904d9 100644
> --- a/arch/sh/mm/cache.c
> +++ b/arch/sh/mm/cache.c
> @@ -310,6 +310,12 @@ void __init cpu_cache_init(void)
>         extern void __weak sh4_cache_init(void);
> 
>         sh4_cache_init();
> +
> +        if(boot_cpu_data.family == CPU_FAMILY_SH4A) {
> +            extern void __weak sh4a_cache_init(void);
> +
> +            sh4a_cache_init();
> +        }
>     }
> 
>     if (boot_cpu_data.family == CPU_FAMILY_SH5) {

--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mundt - 2009-10-28 09:28:39
On Wed, Oct 28, 2009 at 11:11:11AM +0300, Valentin R Sitsikov wrote:
> Hello Paul!
> If you don't mind, could you please comment on this patch?
> 
Sorry I haven't gotten around to this yet; it's certainly on my to-review
list. Last week was quite busy with Kernel Summit and the Japan Linux
Symposium, so I'm still catching up. I'll try to get to it before the
end of the week.
--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Matt Fleming - 2009-11-05 23:22:49
On Wed, Oct 14, 2009 at 04:51:52PM +0400, Valentin R Sitsikov wrote:
> Signed-off-by: Valentin Sitdikov <valentin.sitdikov@siemens.com>
> ---
> arch/sh/include/asm/system_32.h |    2 +-
> arch/sh/mm/Makefile             |    1 +
> arch/sh/mm/cache-sh4a.c         |  169  
> +++++++++++++++++++++++++++++++++++++++

Your email client appears to have word-wrapped this patch, which means
that it does not apply cleanly.

> arch/sh/mm/cache.c              |    6 ++
> 4 files changed, 177 insertions(+), 1 deletions(-)
> create mode 100644 arch/sh/mm/cache-sh4a.c
>
> diff --git a/arch/sh/include/asm/system_32.h  
> b/arch/sh/include/asm/system_32.h
> index 607d413..7fe8011 100644
> --- a/arch/sh/include/asm/system_32.h
> +++ b/arch/sh/include/asm/system_32.h
> @@ -72,7 +72,7 @@ do {                                    \
> #define __ocbp(addr)    __asm__ __volatile__ ( "ocbp @%0\n\t" : : "r"  
> (addr))
> #define __ocbi(addr)    __asm__ __volatile__ ( "ocbi @%0\n\t" : : "r"  
> (addr))
> #define __ocbwb(addr)    __asm__ __volatile__ ( "ocbwb @%0\n\t" : : "r"  
> (addr))
> -
> +#define __icbi(addr)    __asm__ __volatile__ ( "icbi @%0\n\t" : : "r"  
> (addr))

I don't think this change is correct. The icbi instruction is only
available on SH4-A and there is already an __icbi() placeholder for
non-SH4A CPUs. This will break all non-SH4A builds:

arch/sh/include/asm/system_32.h:75: error: "__icbi" redefined
arch/sh/include/asm/system_32.h:69: note: this is the location of the previous definition

[...]

> +/*
> + * Write back & invalidate the D-cache of the page.
> + * (To avoid "alias" issues)
> + */
> +static void sh4a_flush_dcache_page(void *arg)
> +{
> +    struct page *page = arg;
> +    struct address_space *mapping = page_mapping(page);
> +
> +#ifndef CONFIG_SMP
> +    if (mapping && !mapping_mapped(mapping))
> +        set_bit(PG_dcache_dirty, &page->flags);
> +    else
> +#endif
> +    {
> +        sh4a_purge_dcache(page_address(page), PAGE_SIZE);
> +        sh4a_invalidate_icache(page_address(page), PAGE_SIZE);
> +    }

Is there a reason why you are also invalidating the icache here? I think
that only the dcache needs to be written-back and invalidated.

> +/*
> + * SH-4 has virtually indexed and physically tagged cache.
> + */
> +void __init sh4a_cache_init(void)
> +{
> +    printk("SH4A cache optimization\n");
> +
> +    local_flush_icache_range    = sh4a_flush_icache_range;
> +    /* Not sure about alias cases - not checked yet */
> +    if (boot_cpu_data.dcache.n_aliases == 0) {
> +        local_flush_dcache_page        = sh4a_flush_dcache_page;
> +    }
> +
> +}

It is possible for the icache to have aliases too, so you should
probably only use the sh4a optimized versions if there are no icache
aliases. Also, it would be a good idea to move the printk() so that the
"SH4A cache optimization" string is only printed if there are no aliases
in the cache and we're actually using the optimized versions.
--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/arch/sh/include/asm/system_32.h 
b/arch/sh/include/asm/system_32.h
index 607d413..7fe8011 100644
--- a/arch/sh/include/asm/system_32.h
+++ b/arch/sh/include/asm/system_32.h
@@ -72,7 +72,7 @@  do {                                    \
 #define __ocbp(addr)    __asm__ __volatile__ ( "ocbp @%0\n\t" : : "r" 
(addr))
 #define __ocbi(addr)    __asm__ __volatile__ ( "ocbi @%0\n\t" : : "r" 
(addr))
 #define __ocbwb(addr)    __asm__ __volatile__ ( "ocbwb @%0\n\t" : : "r" 
(addr))
-
+#define __icbi(addr)    __asm__ __volatile__ ( "icbi @%0\n\t" : : "r" 
(addr))
 struct task_struct *__switch_to(struct task_struct *prev,
                 struct task_struct *next);
 
diff --git a/arch/sh/mm/Makefile b/arch/sh/mm/Makefile
index b70024d..3a2de1d 100644
--- a/arch/sh/mm/Makefile
+++ b/arch/sh/mm/Makefile
@@ -10,6 +10,7 @@  cacheops-$(CONFIG_CPU_SH3)        := cache-sh3.o
 cacheops-$(CONFIG_CPU_SH4)        := cache-sh4.o flush-sh4.o
 cacheops-$(CONFIG_CPU_SH5)        := cache-sh5.o flush-sh4.o
 cacheops-$(CONFIG_SH7705_CACHE_32KB)    += cache-sh7705.o
+cacheops-$(CONFIG_CPU_SH4A)        += cache-sh4a.o
 
 obj-y            += $(cacheops-y)
 
diff --git a/arch/sh/mm/cache-sh4a.c b/arch/sh/mm/cache-sh4a.c
new file mode 100644
index 0000000..147f0e3
--- /dev/null
+++ b/arch/sh/mm/cache-sh4a.c
@@ -0,0 +1,169 @@ 
+/*
+ * arch/sh/mm/cache-sh4a.c
+ *
+ * Copyright (C) 1999, 2000, 2002  Niibe Yutaka
+ * Copyright (C) 2001 - 2009  Paul Mundt
+ * Copyright (C) 2003  Richard Curnow
+ * Copyright (c) 2007 STMicroelectronics (R&D) Ltd.
+ * Copyright (c) 2009 Valentin Sitdikov
+ *
+ * This file is subject to the terms and conditions of the GNU General 
Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <asm/pgtable.h>
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+
+/*
+ * The maximum number of pages we support up to when doing ranged dcache
+ * flushing. Anything exceeding this will simply flush the dcache in its
+ * entirety.
+ */
+#define MAX_ICACHE_PAGES    32
+
+
+static void sh4a_invalidate_icache(void *start, int size)
+{
+    reg_size_t aligned_start, v, cnt, end;
+
+    aligned_start = register_align(start);
+    v = aligned_start & ~(L1_CACHE_BYTES-1);
+    end = (aligned_start + size + L1_CACHE_BYTES-1)
+        & ~(L1_CACHE_BYTES-1);
+    cnt = (end - v) / L1_CACHE_BYTES;
+
+    while (cnt >= 8) {
+        __icbi(v); v += L1_CACHE_BYTES;
+        __icbi(v); v += L1_CACHE_BYTES;
+        __icbi(v); v += L1_CACHE_BYTES;
+        __icbi(v); v += L1_CACHE_BYTES;
+        __icbi(v); v += L1_CACHE_BYTES;
+        __icbi(v); v += L1_CACHE_BYTES;
+        __icbi(v); v += L1_CACHE_BYTES;
+        __icbi(v); v += L1_CACHE_BYTES;
+        cnt -= 8;
+    }
+
+    while (cnt) {
+        __icbi(v); v += L1_CACHE_BYTES;
+        cnt--;
+    }
+}
+
+/*
+ * Write back the dirty D-caches and invalidate them.
+ *
+ * START: Virtual Address (U0, P1, or P3)
+ * SIZE: Size of the region.
+ */
+static void sh4a_purge_dcache(void *start, int size)
+{
+    reg_size_t aligned_start, v, cnt, end;
+
+    aligned_start = register_align(start);
+    v = aligned_start & ~(L1_CACHE_BYTES-1);
+    end = (aligned_start + size + L1_CACHE_BYTES-1)
+        & ~(L1_CACHE_BYTES-1);
+    cnt = (end - v) / L1_CACHE_BYTES;
+
+    while (cnt >= 8) {
+        __ocbp(v); v += L1_CACHE_BYTES;
+        __ocbp(v); v += L1_CACHE_BYTES;
+        __ocbp(v); v += L1_CACHE_BYTES;
+        __ocbp(v); v += L1_CACHE_BYTES;
+        __ocbp(v); v += L1_CACHE_BYTES;
+        __ocbp(v); v += L1_CACHE_BYTES;
+        __ocbp(v); v += L1_CACHE_BYTES;
+        __ocbp(v); v += L1_CACHE_BYTES;
+        cnt -= 8;
+    }
+    while (cnt) {
+        __ocbp(v); v += L1_CACHE_BYTES;
+        cnt--;
+    }
+}
+
+/*
+ * Write back the range of D-cache, and purge the I-cache.
+ *
+ * Called from kernel/module.c:sys_init_module and routine for a.out 
format,
+ * signal handler code and kprobes code
+ */
+static void __uses_jump_to_uncached sh4a_flush_icache_range(void *args)
+{
+    struct flusher_data *data = args;
+    unsigned long start, end;
+    unsigned long flags, v;
+
+    start = data->addr1;
+    end = data->addr2;
+
+    /* If there are too many pages then just blow away the caches */
+    if (((end - start) >> PAGE_SHIFT) >= MAX_ICACHE_PAGES) {
+        local_flush_cache_all(NULL);
+        return;
+    }
+
+    /*
+     * Selectively flush d-cache then invalidate the i-cache.
+     * This is inefficient, so only use this for small ranges.
+     */
+    start &= ~(L1_CACHE_BYTES-1);
+    end += L1_CACHE_BYTES-1;
+    end &= ~(L1_CACHE_BYTES-1);
+
+    local_irq_save(flags);
+    jump_to_uncached();
+
+    for (v = start; v < end; v += L1_CACHE_BYTES) {
+        __ocbwb(v);
+        __icbi(v);
+    }
+
+    back_to_cached();
+    local_irq_restore(flags);
+}
+
+/*
+ * Write back & invalidate the D-cache of the page.
+ * (To avoid "alias" issues)
+ */
+static void sh4a_flush_dcache_page(void *arg)
+{
+    struct page *page = arg;
+    struct address_space *mapping = page_mapping(page);
+
+#ifndef CONFIG_SMP
+    if (mapping && !mapping_mapped(mapping))
+        set_bit(PG_dcache_dirty, &page->flags);
+    else
+#endif
+    {
+        sh4a_purge_dcache(page_address(page), PAGE_SIZE);
+        sh4a_invalidate_icache(page_address(page), PAGE_SIZE);
+    }
+}
+
+
+/*
+ * SH-4 has virtually indexed and physically tagged cache.
+ */
+void __init sh4a_cache_init(void)
+{
+    printk("SH4A cache optimization\n");
+
+    local_flush_icache_range    = sh4a_flush_icache_range;
+    /* Not sure about alias cases - not checked yet */
+    if (boot_cpu_data.dcache.n_aliases == 0) {
+        local_flush_dcache_page        = sh4a_flush_dcache_page;
+    }
+
+}
diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c
index 4aa9260..72904d9 100644
--- a/arch/sh/mm/cache.c
+++ b/arch/sh/mm/cache.c
@@ -310,6 +310,12 @@  void __init cpu_cache_init(void)
         extern void __weak sh4_cache_init(void);
 
         sh4_cache_init();
+
+        if(boot_cpu_data.family == CPU_FAMILY_SH4A) {
+            extern void __weak sh4a_cache_init(void);
+
+            sh4a_cache_init();
+        }
     }
 
     if (boot_cpu_data.family == CPU_FAMILY_SH5) {