diff mbox

[RFC,3/4] tests/tcg: Add mttcg ARM64 litmus tests

Message ID 20160805060327.1464-4-bobby.prani@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Pranith Kumar Aug. 5, 2016, 6:03 a.m. UTC
Add a few ARM64 litmus tests.

Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
---
 tests/tcg/mttcg/aarch64/ARMARM00.c       |  501 +++++++++++++
 tests/tcg/mttcg/aarch64/ARMARM01.c       |  504 +++++++++++++
 tests/tcg/mttcg/aarch64/ARMARM02.c       |  571 +++++++++++++++
 tests/tcg/mttcg/aarch64/ARMARM03.c       |  498 +++++++++++++
 tests/tcg/mttcg/aarch64/ARMARM04+BIS.c   |  556 +++++++++++++++
 tests/tcg/mttcg/aarch64/ARMARM04+TER.c   |  538 ++++++++++++++
 tests/tcg/mttcg/aarch64/ARMARM04.c       |  556 +++++++++++++++
 tests/tcg/mttcg/aarch64/ARMARM05.c       |  553 ++++++++++++++
 tests/tcg/mttcg/aarch64/ARMARM06+AP+AA.c |  581 +++++++++++++++
 tests/tcg/mttcg/aarch64/ARMARM06+AP+AP.c |  581 +++++++++++++++
 tests/tcg/mttcg/aarch64/ARMARM06.c       |  581 +++++++++++++++
 tests/tcg/mttcg/aarch64/Makefile         |   52 ++
 tests/tcg/mttcg/aarch64/README.txt       |   22 +
 tests/tcg/mttcg/aarch64/affinity.c       |  159 +++++
 tests/tcg/mttcg/aarch64/affinity.h       |   34 +
 tests/tcg/mttcg/aarch64/comp.sh          |   30 +
 tests/tcg/mttcg/aarch64/litmus_rand.c    |   64 ++
 tests/tcg/mttcg/aarch64/litmus_rand.h    |   29 +
 tests/tcg/mttcg/aarch64/outs.c           |  148 ++++
 tests/tcg/mttcg/aarch64/outs.h           |   49 ++
 tests/tcg/mttcg/aarch64/run.sh           |  351 +++++++++
 tests/tcg/mttcg/aarch64/show.awk         |    2 +
 tests/tcg/mttcg/aarch64/utils.c          | 1148 ++++++++++++++++++++++++++++++
 tests/tcg/mttcg/aarch64/utils.h          |  275 +++++++
 24 files changed, 8383 insertions(+)
 create mode 100644 tests/tcg/mttcg/aarch64/ARMARM00.c
 create mode 100644 tests/tcg/mttcg/aarch64/ARMARM01.c
 create mode 100644 tests/tcg/mttcg/aarch64/ARMARM02.c
 create mode 100644 tests/tcg/mttcg/aarch64/ARMARM03.c
 create mode 100644 tests/tcg/mttcg/aarch64/ARMARM04+BIS.c
 create mode 100644 tests/tcg/mttcg/aarch64/ARMARM04+TER.c
 create mode 100644 tests/tcg/mttcg/aarch64/ARMARM04.c
 create mode 100644 tests/tcg/mttcg/aarch64/ARMARM05.c
 create mode 100644 tests/tcg/mttcg/aarch64/ARMARM06+AP+AA.c
 create mode 100644 tests/tcg/mttcg/aarch64/ARMARM06+AP+AP.c
 create mode 100644 tests/tcg/mttcg/aarch64/ARMARM06.c
 create mode 100644 tests/tcg/mttcg/aarch64/Makefile
 create mode 100644 tests/tcg/mttcg/aarch64/README.txt
 create mode 100644 tests/tcg/mttcg/aarch64/affinity.c
 create mode 100644 tests/tcg/mttcg/aarch64/affinity.h
 create mode 100644 tests/tcg/mttcg/aarch64/comp.sh
 create mode 100644 tests/tcg/mttcg/aarch64/litmus_rand.c
 create mode 100644 tests/tcg/mttcg/aarch64/litmus_rand.h
 create mode 100644 tests/tcg/mttcg/aarch64/outs.c
 create mode 100644 tests/tcg/mttcg/aarch64/outs.h
 create mode 100755 tests/tcg/mttcg/aarch64/run.sh
 create mode 100644 tests/tcg/mttcg/aarch64/show.awk
 create mode 100644 tests/tcg/mttcg/aarch64/utils.c
 create mode 100644 tests/tcg/mttcg/aarch64/utils.h
diff mbox

Patch

diff --git a/tests/tcg/mttcg/aarch64/ARMARM00.c b/tests/tcg/mttcg/aarch64/ARMARM00.c
new file mode 100644
index 0000000..3664ef2
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/ARMARM00.c
@@ -0,0 +1,501 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* This C source is a product of litmus7 and includes source that is        */
+/* governed by the CeCILL-B license.                                        */
+/****************************************************************************/
+/* Parameters */
+#define SIZE_OF_TEST 100000
+#define NUMBER_OF_RUN 10
+#define AVAIL 0
+#define STRIDE 1
+#define MAX_LOOP 0
+#define N 2
+#define AFF_INCR (0)
+/* Includes */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <time.h>
+#include <limits.h>
+#include "utils.h"
+#include "outs.h"
+#include "affinity.h"
+
+/* Run parameters shared by all test instances; run() fills them in from cmd */
+typedef struct {
+  int verbose; /* copied from cmd->verbose */
+  int size_of_test,max_run; /* cells per run (array length), number of runs */
+  int stride; /* step used when walking the test cells */
+  aff_mode_t aff_mode; /* copied from cmd->aff_mode */
+  int ncpus, ncpus_used; /* size of the cpu array, and the N*n_exe slots in use */
+  int do_change; /* non-zero: permute thread functions and threads between runs */
+} param_t;
+
+
+/* Full memory barrier: issues DSB SY; the memory clobber also stops compiler reordering */
+inline static void mbar(void) {
+  asm __volatile__ ("dsb sy" ::: "memory");
+}
+
+/* Per-cell start barrier: the thread with id == k % N sets *b, all other threads spin until it is set */
+inline static void barrier_wait(unsigned int id, unsigned int k, int volatile *b) {
+  if ((k % N) == id) {
+    *b = 1 ; /* release the spinning threads */
+  } else {
+    while (*b == 0) ; /* busy-wait on the volatile flag */
+  }
+}
+
+/**********************/
+/* Context definition */
+/**********************/
+
+
+typedef struct {
+/* Shared variables, one cell per test iteration (arrays of size_of_test) */
+  int *y;
+  int *x;
+/* Final content of observed registers, one cell per test iteration */
+  int *out_1_x0;
+  int *out_1_x2;
+/* Barrier that check_globals waits on before the litmus loop */
+  pb_t *fst_barrier;
+/* Barrier for litmus loop, one flag per test iteration */
+  int volatile *barrier;
+/* Instance seed */
+  st_t seed;
+/* Parameters */
+  param_t *_p;
+} ctx_t;
+
+inline static int final_cond(int _out_1_x0,int _out_1_x2) { /* Return 1 iff 1:X0 == 1 and 1:X2 == 0, else 0 */
+  switch (_out_1_x0) {
+  case 1:
+    switch (_out_1_x2) {
+    case 0:
+      return 1; /* the outcome named in the test condition */
+    default:
+      return 0;
+    }
+  default:
+    return 0;
+  }
+}
+
+inline static int final_ok(int cond) { /* Returns cond unchanged */
+  return cond;
+}
+
+/**********************/
+/* Outcome collection */
+/**********************/
+#define NOUTS 2 /* number of slots in an outcome */
+typedef intmax_t outcome_t[NOUTS]; /* one slot per observed register */
+
+static const int out_1_x0_f = 0 ; /* slot holding 1:X0 */
+static const int out_1_x2_f = 1 ; /* slot holding 1:X2 */
+
+
+typedef struct hist_t { /* Histogram of the outcomes observed by a test instance */
+  outs_t *outcomes ; /* collection updated through add_outcome_outs and merge_outs */
+  count_t n_pos,n_neg ; /* outcomes for which the condition held / did not hold */
+} hist_t ;
+
+static hist_t *alloc_hist(void) { /* Allocate a histogram with no outcomes and zeroed counters */
+  hist_t *p = malloc_check(sizeof(*p)) ;
+  p->outcomes = NULL ;
+  p->n_pos = p->n_neg = 0 ;
+  return p ; /* callers release it with free_hist */
+}
+
+static void free_hist(hist_t *h) { /* Release the outcome collection, then the histogram itself */
+  free_outs(h->outcomes) ;
+  free(h) ;
+}
+
+static void add_outcome(hist_t *h, count_t v, outcome_t o, int show) { /* Record outcome o in h; v and show are forwarded to add_outcome_outs */
+  h->outcomes = add_outcome_outs(h->outcomes,o,NOUTS,v,show) ;
+}
+
+static void merge_hists(hist_t *h0, hist_t *h1) { /* Fold h1 into h0: add the counters and merge the outcome collections */
+  h0->n_pos += h1->n_pos ;
+  h0->n_neg += h1->n_neg ;
+  h0->outcomes = merge_outs(h0->outcomes,h1->outcomes,NOUTS) ;
+}
+
+static count_t sum_hist(hist_t *h) { /* Total reported by sum_outs for the outcomes of h */
+  return sum_outs(h->outcomes) ;
+}
+
+
+static void do_dump_outcome(FILE *fhist, intmax_t *o, count_t c, int show) { /* Print one histogram line: count c, a star if show is set or a colon otherwise, then both register values */
+  fprintf(fhist,"%-6"PCTR"%c>1:X0=%i; 1:X2=%i;\n",c,show ? '*' : ':',(int)o[out_1_x0_f],(int)o[out_1_x2_f]);
+}
+
+static void just_dump_outcomes(FILE *fhist, hist_t *h) { /* Dump the outcomes of h to fhist, with do_dump_outcome as the per-line printer */
+  outcome_t buff ; /* scratch outcome handed to dump_outs */
+  dump_outs(fhist,do_dump_outcome,h->outcomes,buff,NOUTS) ;
+}
+
+/*******************************************************/
+/* Context allocation, freeing and reinitialization    */
+/*******************************************************/
+
+static void init(ctx_t *_a) { /* Allocate the per-instance arrays and barrier; _a->_p must already be set */
+  int size_of_test = _a->_p->size_of_test;
+
+  _a->seed = rand(); /* instance seed taken from rand() */
+  _a->out_1_x0 = malloc_check(size_of_test*sizeof(*(_a->out_1_x0)));
+  _a->out_1_x2 = malloc_check(size_of_test*sizeof(*(_a->out_1_x2)));
+  _a->y = malloc_check(size_of_test*sizeof(*(_a->y)));
+  _a->x = malloc_check(size_of_test*sizeof(*(_a->x)));
+  _a->fst_barrier = pb_create(N); /* barrier sized for the N test threads */
+  _a->barrier = malloc_check(size_of_test*sizeof(*(_a->barrier)));
+}
+
+static void finalize(ctx_t *_a) { /* Release everything that init allocated */
+  free((void *)_a->y);
+  free((void *)_a->x);
+  free((void *)_a->out_1_x0);
+  free((void *)_a->out_1_x2);
+  pb_free(_a->fst_barrier);
+  free((void *)_a->barrier); /* cast drops the volatile qualifier for free */
+}
+
+static void reinit(ctx_t *_a) { /* Reset every cell to its initial state before a run */
+  for (int _i = _a->_p->size_of_test-1 ; _i >= 0 ; _i--) {
+    _a->y[_i] = 0;
+    _a->x[_i] = 0;
+    _a->out_1_x0[_i] = -239487; /* placeholder value until P1 stores its result */
+    _a->out_1_x2[_i] = -239487;
+    _a->barrier[_i] = 0; /* barrier flag not yet released */
+  }
+}
+
+/**************************************/
+/* Prefetch (and check) global values */
+/**************************************/
+
+static void check_globals(ctx_t *_a) { /* Check a random sample of x and y cells for zero, then wait on fst_barrier */
+  int *y = _a->y;
+  int *x = _a->x;
+  for (int _i = _a->_p->size_of_test-1 ; _i >= 0 ; _i--) {
+    if (rand_bit(&(_a->seed)) && y[_i] != 0) fatal("ARMARM00, check_globals failed"); /* cell is only read when rand_bit is non-zero */
+    if (rand_bit(&(_a->seed)) && x[_i] != 0) fatal("ARMARM00, check_globals failed"); /* NOTE(review): P0 and P1 both call this on the same ctx, so seed is updated by two threads */
+  }
+  pb_wait(_a->fst_barrier);
+}
+
+/***************/
+/* Litmus code */
+/***************/
+
+typedef struct {
+  int th_id; /* Index of this test thread; also used to index cpu[] */
+  int *cpu; /* Cpu slots of the instance, one entry per thread id */
+  ctx_t *_a;   /* Context shared by the threads of the instance */
+} parg_t;
+
+
+
+
+
+static void *P0(void *_vb) { /* Thread P0: per cell, store 1 to x then store-release (stlr) 1 to y; _vb is a parg_t pointer */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id]; /* cpu slot selected by this thread id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM00");
+  check_globals(_a); /* sample-check the initial zeros, then wait on fst_barrier */
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* one pass per offset within the stride */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) { /* visit cells from high to low index */
+      barrier_wait(_th_id,_i,&barrier[_i]); /* start cell _i together with the other test thread */
+      int trashed_x0; /* scratch registers; their final values are not recorded */
+      int trashed_x2;
+asm __volatile__ (
+"\n"
+"#START _litmus_P0\n"
+"#_litmus_P0_0\n\t"
+"mov %w[x0],#1\n"
+"#_litmus_P0_1\n\t"
+"str %w[x0],[%[x1]]\n"
+"#_litmus_P0_2\n\t"
+"mov %w[x2],#1\n"
+"#_litmus_P0_3\n\t"
+"stlr %w[x2],[%[x3]]\n"
+"#END _litmus_P0\n\t"
+:[x2] "=&r" (trashed_x2),[x0] "=&r" (trashed_x0)
+:[x1] "r" (&_a->x[_i]),[x3] "r" (&_a->y[_i]) /* x1 = address of the x cell, x3 = address of the y cell */
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+static void *P1(void *_vb) { /* Thread P1: per cell, load-acquire (ldar) y into x0; if x0 is 1, load x into x2; _vb is a parg_t pointer */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id]; /* cpu slot selected by this thread id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM00");
+  check_globals(_a); /* sample-check the initial zeros, then wait on fst_barrier */
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_1_x0 = _a->out_1_x0; /* observed registers are written straight into the context arrays */
+  int *out_1_x2 = _a->out_1_x2;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* one pass per offset within the stride */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) { /* visit cells from high to low index */
+      barrier_wait(_th_id,_i,&barrier[_i]); /* start cell _i together with the other test thread */
+asm __volatile__ (
+"\n"
+"#START _litmus_P1\n"
+"#_litmus_P1_0\n\t"
+"ldar %w[x0],[%[x1]]\n"
+"#_litmus_P1_1\n\t"
+"cmp %w[x0],#1\n"
+"#_litmus_P1_2\n\t"
+"b.ne 0f\n"
+"#_litmus_P1_3\n\t"
+"ldr %w[x2],[%[x3]]\n"
+"#_litmus_P1_4\n"
+"0:\n"
+"#END _litmus_P1\n\t"
+:[x2] "=&r" (out_1_x2[_i]),[x0] "=&r" (out_1_x0[_i])
+:[x1] "r" (&_a->y[_i]),"[x2]" (-1),[x3] "r" (&_a->x[_i]) /* x2 is preset to -1, so it stays -1 when the branch skips the second load */
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+typedef struct { /* Argument block handed to one zyva test instance */
+  pm_t *p_mutex; /* mutex created by run and shared by all instances */
+  pb_t *p_barrier; /* barrier shared by the n_exe instances */
+  param_t *_p; /* shared run parameters */
+  int z_id; /* instance index; instance 0 reshuffles cpus in random affinity mode */
+  int *cpus; /* cpu slots assigned to this instance */
+} zyva_t;
+
+#define NT N /* number of threads per instance */
+
+static void *zyva(void *_va) { /* One test instance: runs max_run rounds of the N test threads; returns a hist_t pointer that the caller frees */
+  zyva_t *_a = (zyva_t *) _va;
+  param_t *_b = _a->_p;
+  pb_wait(_a->p_barrier); /* wait on the barrier shared by all instances */
+  pthread_t thread[NT];
+  parg_t parg[N];
+  f_t *fun[] = {&P0,&P1};
+  hist_t *hist = alloc_hist();
+  ctx_t ctx;
+  ctx._p = _b; /* init reads size_of_test through _p */
+
+  init(&ctx);
+  for (int _p = N-1 ; _p >= 0 ; _p--) {
+    parg[_p].th_id = _p; parg[_p]._a = &ctx;
+    parg[_p].cpu = &(_a->cpus[0]); /* every thread gets the same array and indexes it by th_id */
+  }
+
+  for (int n_run = 0 ; n_run < _b->max_run ; n_run++) {
+    if (_b->aff_mode == aff_random) {
+      pb_wait(_a->p_barrier);
+      if (_a->z_id == 0) perm_prefix_ints(&ctx.seed,_a->cpus,_b->ncpus_used,_b->ncpus); /* only instance 0 permutes, between two barrier waits */
+      pb_wait(_a->p_barrier);
+    } else {
+    }
+    if (_b->verbose>1) fprintf(stderr,"Run %i of %i\r", n_run, _b->max_run);
+    reinit(&ctx); /* reset cells, outputs and barrier flags */
+    if (_b->do_change) perm_funs(&ctx.seed,fun,N);
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      launch(&thread[_p],fun[_p],&parg[_p]);
+    }
+    if (_b->do_change) perm_threads(&ctx.seed,thread,NT);
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      join(&thread[_p]);
+    }
+    /* Log final states: one outcome per cell */
+    for (int _i = _b->size_of_test-1 ; _i >= 0 ; _i--) {
+      int _out_1_x0_i = ctx.out_1_x0[_i];
+      int _out_1_x2_i = ctx.out_1_x2[_i];
+      outcome_t o;
+      int cond;
+
+      cond = final_ok(final_cond(_out_1_x0_i,_out_1_x2_i));
+      o[out_1_x0_f] = _out_1_x0_i;
+      o[out_1_x2_f] = _out_1_x2_i;
+      add_outcome(hist,1,o,cond); /* count 1; cond is passed as the show flag */
+      if (cond) { hist->n_pos++; } else { hist->n_neg++; }
+    }
+  }
+
+  finalize(&ctx);
+  return hist;
+}
+
+#define ENOUGH 10
+
+static int postlude(FILE *out,cmd_t *cmd,hist_t *hist,count_t p_true,count_t p_false,tsc_t total) { /* Print the report for ARMARM00 to out; returns 1 when p_true is 0. NOTE(review): cmd is not used here */
+  fprintf(out,"Test ARMARM00 Forbidden\n");
+  fprintf(out,"Histogram (%i states)\n",finals_outs(hist->outcomes));
+  just_dump_outcomes(out,hist);
+  int cond = p_true == 0; /* the condition is ~exists, so it holds when no matching outcome was seen */
+  fprintf(out,"%s\n",cond?"Ok":"No");
+  fprintf(out,"\nWitnesses\n");
+  fprintf(out,"Positive: %" PCTR ", Negative: %" PCTR "\n",p_false,p_true); /* Positive prints p_false and Negative prints p_true */
+  fprintf(out,"Condition ~exists (1:X0=1 /\\ 1:X2=0) is %svalidated\n",cond ? "" : "NOT ");
+  fprintf(out,"Hash=8b05db686103708c048891dddc96e8bd\n");
+  fprintf(out,"Com=Rf Fr\n");
+  fprintf(out,"Orig=PodWWPL RfeLA PodRRAP Fre\n");
+  count_t cond_true = p_true;
+  count_t cond_false = p_false;
+  fprintf(out,"Observation ARMARM00 %s %" PCTR " %" PCTR "\n",!cond_true ? "Never" : !cond_false ? "Always" : "Sometimes",cond_true,cond_false);
+  if (p_true > 0) { /* empty body */
+  }
+  fprintf(out,"Time ARMARM00 %.2f\n",total / 1000000.0);
+  fflush(out);
+  return cond;
+}
+
+static int run(cmd_t *cmd,cpus_t *def_all_cpus,FILE *out) { /* Run n_exe concurrent instances, merge their histograms and print the report; returns the postlude result. NOTE(review): def_all_cpus is not used here */
+  tsc_t start = timeofday();
+  param_t prm ;
+/* Copy settings from cmd into the shared parameter block */
+  prm.verbose = cmd->verbose;
+  prm.size_of_test = cmd->size_of_test;
+  prm.max_run = cmd->max_run;
+  prm.stride = cmd->stride;
+  prm.do_change = 1;
+  if (cmd->fix) prm.do_change = 0; /* cmd->fix turns off the permutations done in zyva */
+/* Compute the number of concurrent test instances */
+  int n_avail = cmd->avail > 0 ? cmd->avail : cmd->aff_cpus->sz;
+  if (n_avail >  cmd->aff_cpus->sz) log_error("Warning: avail=%i, available=%i\n",n_avail, cmd->aff_cpus->sz);
+  int n_exe;
+  if (cmd->n_exe > 0) {
+    n_exe = cmd->n_exe; /* an explicit request wins */
+  } else {
+    n_exe = n_avail < N ? 1 : n_avail / N; /* otherwise one instance per N available cpus, at least one */
+  }
+/* Set affinity parameters */
+  cpus_t *all_cpus = cmd->aff_cpus;
+  int aff_cpus_sz = cmd->aff_mode == aff_random ? max(all_cpus->sz,N*n_exe) : N*n_exe;
+  int aff_cpus[aff_cpus_sz]; /* cpu slots handed to the instances, N per instance; NOTE(review): only filled below for aff_random and aff_incr */
+  prm.aff_mode = cmd->aff_mode;
+  prm.ncpus = aff_cpus_sz;
+  prm.ncpus_used = N*n_exe;
+/* Show parameters to user */
+  if (prm.verbose) {
+    log_error( "ARMARM00: n=%i, r=%i, s=%i",n_exe,prm.max_run,prm.size_of_test);
+    log_error(", st=%i",prm.stride);
+    if (cmd->aff_mode == aff_incr) {
+      log_error( ", i=%i",cmd->aff_incr);
+    } else if (cmd->aff_mode == aff_random) {
+      log_error(", +ra");
+    } else if (cmd->aff_mode == aff_custom) {
+      log_error(", +ca");
+    } else if (cmd->aff_mode == aff_scan) {
+      log_error(", +sa");
+    }
+    log_error(", p='");
+    cpus_dump(stderr,cmd->aff_cpus);
+    log_error("'");
+    log_error("\n");
+  }
+  if (cmd->aff_mode == aff_random) {
+    for (int k = 0 ; k < aff_cpus_sz ; k++) {
+      aff_cpus[k] = all_cpus->cpu[k % all_cpus->sz]; /* fill by cycling through the cpu list */
+    }
+  }
+  hist_t *hist = NULL;
+  int n_th = n_exe-1; /* the last instance runs on the calling thread */
+  pthread_t th[n_th]; /* NOTE(review): zero-length VLA when n_exe == 1 */
+  zyva_t zarg[n_exe];
+  pm_t *p_mutex = pm_create();
+  pb_t *p_barrier = pb_create(n_exe);
+  int next_cpu = 0;
+  int delta = cmd->aff_incr;
+  if (delta <= 0) {
+    for (int k=0 ; k < all_cpus->sz ; k++) all_cpus->cpu[k] = -1; /* every cpu id becomes -1; presumably means no pinning, confirm in force_one_affinity */
+    delta = 1;
+  } else {
+    delta %= all_cpus->sz;
+  }
+  int start_scan=0, max_start=gcd(delta,all_cpus->sz);
+  int *aff_p = aff_cpus;
+  for (int k=0 ; k < n_exe ; k++) {
+    zyva_t *p = &zarg[k];
+    p->_p = &prm;
+    p->p_mutex = p_mutex; p->p_barrier = p_barrier; 
+    p->z_id = k;
+    p->cpus = aff_p; /* this instance owns the next N slots */
+    if (cmd->aff_mode != aff_incr) {
+      aff_p += N;
+    } else {
+      for (int i=0 ; i < N ; i++) {
+        *aff_p = all_cpus->cpu[next_cpu]; aff_p++;
+        next_cpu += delta; next_cpu %= all_cpus->sz; /* step by delta, wrapping around the cpu list */
+        if (next_cpu == start_scan) { /* back at the starting point: move the start forward */
+          start_scan++ ; start_scan %= max_start;
+          next_cpu = start_scan;
+        }
+      }
+    }
+    if (k < n_th) {
+      launch(&th[k],zyva,p);
+    } else {
+      hist = (hist_t *)zyva(p); /* last instance runs here and its histogram becomes the merge target */
+    }
+  }
+
+  count_t n_outs = prm.size_of_test; n_outs *= prm.max_run; /* outcomes expected from one instance */
+  for (int k=0 ; k < n_th ; k++) {
+    hist_t *hk = (hist_t *)join(&th[k]);
+    if (sum_hist(hk) != n_outs || hk->n_pos + hk->n_neg != n_outs) {
+      fatal("ARMARM00, sum_hist");
+    }
+    merge_hists(hist,hk);
+    free_hist(hk);
+  }
+  cpus_free(all_cpus); /* releases cmd->aff_cpus */
+  tsc_t total = timeofday() - start;
+  pm_free(p_mutex);
+  pb_free(p_barrier);
+
+  n_outs *= n_exe ; /* outcomes expected across all instances */
+  if (sum_hist(hist) != n_outs || hist->n_pos + hist->n_neg != n_outs) {
+    fatal("ARMARM00, sum_hist") ;
+  }
+  count_t p_true = hist->n_pos, p_false = hist->n_neg;
+  int cond = postlude(out,cmd,hist,p_true,p_false,total);
+  free_hist(hist);
+  return cond;
+}
+
+
+int main(int argc, char **argv) { /* Parse the command line, run the test, exit with success iff run returns non-zero */
+  cpus_t *def_all_cpus = read_force_affinity(AVAIL,0);
+  if (def_all_cpus->sz < N) { /* fewer cpus than test threads: exit successfully without running */
+    cpus_free(def_all_cpus);
+    return EXIT_SUCCESS;
+  }
+  cmd_t def = { 0, NUMBER_OF_RUN, SIZE_OF_TEST, STRIDE, AVAIL, 0, 0, aff_incr, 0, 0, AFF_INCR, def_all_cpus, NULL, -1, MAX_LOOP, NULL, NULL, -1, -1, -1, 0, 0}; /* positional defaults; field order comes from cmd_t, declared outside this file */
+  cmd_t cmd = def;
+  parse_cmd(argc,argv,&def,&cmd);
+  int cond = run(&cmd,def_all_cpus,stdout);
+  if (def_all_cpus != cmd.aff_cpus) cpus_free(def_all_cpus); /* run already freed cmd.aff_cpus; free the default list only if it is a different one */
+  return cond ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/tests/tcg/mttcg/aarch64/ARMARM01.c b/tests/tcg/mttcg/aarch64/ARMARM01.c
new file mode 100644
index 0000000..4320bb0
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/ARMARM01.c
@@ -0,0 +1,504 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* This C source is a product of litmus7 and includes source that is        */
+/* governed by the CeCILL-B license.                                        */
+/****************************************************************************/
+/* Parameters */
+#define SIZE_OF_TEST 100000
+#define NUMBER_OF_RUN 10
+#define AVAIL 0
+#define STRIDE 1
+#define MAX_LOOP 0
+#define N 2
+#define AFF_INCR (0)
+/* Includes */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <time.h>
+#include <limits.h>
+#include "utils.h"
+#include "outs.h"
+#include "affinity.h"
+
+/* Run parameters shared by all test instances; run() fills them in from cmd */
+typedef struct {
+  int verbose; /* copied from cmd->verbose */
+  int size_of_test,max_run; /* cells per run (array length), number of runs */
+  int stride; /* step used when walking the test cells */
+  aff_mode_t aff_mode; /* copied from cmd->aff_mode */
+  int ncpus, ncpus_used; /* size of the cpu array, and the N*n_exe slots in use */
+  int do_change; /* non-zero: permute thread functions and threads between runs */
+} param_t;
+
+
+/* Full memory barrier: issues DSB SY; the memory clobber also stops compiler reordering */
+inline static void mbar(void) {
+  asm __volatile__ ("dsb sy" ::: "memory");
+}
+
+/* Per-cell start barrier: the thread with id == k % N sets *b, all other threads spin until it is set */
+inline static void barrier_wait(unsigned int id, unsigned int k, int volatile *b) {
+  if ((k % N) == id) {
+    *b = 1 ; /* release the spinning threads */
+  } else {
+    while (*b == 0) ; /* busy-wait on the volatile flag */
+  }
+}
+
+/**********************/
+/* Context definition */
+/**********************/
+
+
+typedef struct {
+/* Shared variables, one cell per test iteration (arrays of size_of_test) */
+  int *y;
+  int *x;
+/* Final content of observed registers, one cell per test iteration */
+  int *out_1_x0;
+  int *out_1_x3;
+/* Barrier that check_globals waits on before the litmus loop */
+  pb_t *fst_barrier;
+/* Barrier for litmus loop, one flag per test iteration */
+  int volatile *barrier;
+/* Instance seed */
+  st_t seed;
+/* Parameters */
+  param_t *_p;
+} ctx_t;
+
+inline static int final_cond(int _out_1_x0,int _out_1_x3) { /* Return 1 iff 1:X0 == 1 and 1:X3 == 0, else 0 */
+  switch (_out_1_x0) {
+  case 1:
+    switch (_out_1_x3) {
+    case 0:
+      return 1; /* the outcome named in the test condition */
+    default:
+      return 0;
+    }
+  default:
+    return 0;
+  }
+}
+
+inline static int final_ok(int cond) { /* Returns cond unchanged */
+  return cond;
+}
+
+/**********************/
+/* Outcome collection */
+/**********************/
+#define NOUTS 2 /* number of slots in an outcome */
+typedef intmax_t outcome_t[NOUTS]; /* one slot per observed register */
+
+static const int out_1_x0_f = 0 ; /* slot holding 1:X0 */
+static const int out_1_x3_f = 1 ; /* slot holding 1:X3 */
+
+
+typedef struct hist_t { /* Histogram of the outcomes observed by a test instance */
+  outs_t *outcomes ; /* collection updated through add_outcome_outs and merge_outs */
+  count_t n_pos,n_neg ; /* outcomes for which the condition held / did not hold */
+} hist_t ;
+
+static hist_t *alloc_hist(void) { /* Allocate a histogram with no outcomes and zeroed counters */
+  hist_t *p = malloc_check(sizeof(*p)) ;
+  p->outcomes = NULL ;
+  p->n_pos = p->n_neg = 0 ;
+  return p ; /* callers release it with free_hist */
+}
+
+static void free_hist(hist_t *h) { /* Release the outcome collection, then the histogram itself */
+  free_outs(h->outcomes) ;
+  free(h) ;
+}
+
+static void add_outcome(hist_t *h, count_t v, outcome_t o, int show) { /* Record outcome o in h; v and show are forwarded to add_outcome_outs */
+  h->outcomes = add_outcome_outs(h->outcomes,o,NOUTS,v,show) ;
+}
+
+static void merge_hists(hist_t *h0, hist_t *h1) { /* Fold h1 into h0: add the counters and merge the outcome collections */
+  h0->n_pos += h1->n_pos ;
+  h0->n_neg += h1->n_neg ;
+  h0->outcomes = merge_outs(h0->outcomes,h1->outcomes,NOUTS) ;
+}
+
+static count_t sum_hist(hist_t *h) { /* Total reported by sum_outs for the outcomes of h */
+  return sum_outs(h->outcomes) ;
+}
+
+
+static void do_dump_outcome(FILE *fhist, intmax_t *o, count_t c, int show) { /* Print one histogram line: count c, a star if show is set or a colon otherwise, then both register values */
+  fprintf(fhist,"%-6"PCTR"%c>1:X0=%i; 1:X3=%i;\n",c,show ? '*' : ':',(int)o[out_1_x0_f],(int)o[out_1_x3_f]);
+}
+
+static void just_dump_outcomes(FILE *fhist, hist_t *h) { /* Dump the outcomes of h to fhist, with do_dump_outcome as the per-line printer */
+  outcome_t buff ; /* scratch outcome handed to dump_outs */
+  dump_outs(fhist,do_dump_outcome,h->outcomes,buff,NOUTS) ;
+}
+
+/*******************************************************/
+/* Context allocation, freeing and reinitialization    */
+/*******************************************************/
+
+static void init(ctx_t *_a) { /* Allocate the per-instance arrays and barrier; _a->_p must already be set */
+  int size_of_test = _a->_p->size_of_test;
+
+  _a->seed = rand(); /* instance seed taken from rand() */
+  _a->out_1_x0 = malloc_check(size_of_test*sizeof(*(_a->out_1_x0)));
+  _a->out_1_x3 = malloc_check(size_of_test*sizeof(*(_a->out_1_x3)));
+  _a->y = malloc_check(size_of_test*sizeof(*(_a->y)));
+  _a->x = malloc_check(size_of_test*sizeof(*(_a->x)));
+  _a->fst_barrier = pb_create(N); /* barrier sized for the N test threads */
+  _a->barrier = malloc_check(size_of_test*sizeof(*(_a->barrier)));
+}
+
+static void finalize(ctx_t *_a) { /* Release everything that init allocated */
+  free((void *)_a->y);
+  free((void *)_a->x);
+  free((void *)_a->out_1_x0);
+  free((void *)_a->out_1_x3);
+  pb_free(_a->fst_barrier);
+  free((void *)_a->barrier); /* cast drops the volatile qualifier for free */
+}
+
+static void reinit(ctx_t *_a) { /* Reset every cell to its initial state before a run */
+  for (int _i = _a->_p->size_of_test-1 ; _i >= 0 ; _i--) {
+    _a->y[_i] = 0;
+    _a->x[_i] = 0;
+    _a->out_1_x0[_i] = -239487; /* placeholder value until P1 stores its result */
+    _a->out_1_x3[_i] = -239487;
+    _a->barrier[_i] = 0; /* barrier flag not yet released */
+  }
+}
+
+/**************************************/
+/* Prefetch (and check) global values */
+/**************************************/
+
+static void check_globals(ctx_t *_a) { /* Check a random sample of x and y cells for zero, then wait on fst_barrier */
+  int *y = _a->y;
+  int *x = _a->x;
+  for (int _i = _a->_p->size_of_test-1 ; _i >= 0 ; _i--) {
+    if (rand_bit(&(_a->seed)) && y[_i] != 0) fatal("ARMARM01, check_globals failed"); /* cell is only read when rand_bit is non-zero */
+    if (rand_bit(&(_a->seed)) && x[_i] != 0) fatal("ARMARM01, check_globals failed"); /* NOTE(review): P0 and P1 both call this on the same ctx, so seed is updated by two threads */
+  }
+  pb_wait(_a->fst_barrier);
+}
+
+/***************/
+/* Litmus code */
+/***************/
+
+typedef struct {
+  int th_id; /* Index of this test thread; also used to index cpu[] */
+  int *cpu; /* Cpu slots of the instance, one entry per thread id */
+  ctx_t *_a;   /* Context shared by the threads of the instance */
+} parg_t;
+
+
+
+
+
+static void *P0(void *_vb) { /* Thread P0: per cell, store 1 to x then store-release (stlr) 1 to y; _vb is a parg_t pointer */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id]; /* cpu slot selected by this thread id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM01");
+  check_globals(_a); /* sample-check the initial zeros, then wait on fst_barrier */
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* one pass per offset within the stride */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) { /* visit cells from high to low index */
+      barrier_wait(_th_id,_i,&barrier[_i]); /* start cell _i together with the other test thread */
+      int trashed_x0; /* scratch registers; their final values are not recorded */
+      int trashed_x2;
+asm __volatile__ (
+"\n"
+"#START _litmus_P0\n"
+"#_litmus_P0_0\n\t"
+"mov %w[x0],#1\n"
+"#_litmus_P0_1\n\t"
+"str %w[x0],[%[x1]]\n"
+"#_litmus_P0_2\n\t"
+"mov %w[x2],#1\n"
+"#_litmus_P0_3\n\t"
+"stlr %w[x2],[%[x3]]\n"
+"#END _litmus_P0\n\t"
+:[x2] "=&r" (trashed_x2),[x0] "=&r" (trashed_x0)
+:[x1] "r" (&_a->x[_i]),[x3] "r" (&_a->y[_i]) /* x1 = address of the x cell, x3 = address of the y cell */
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+static void *P1(void *_vb) { /* Thread P1: per cell, plain load of y into x0; if x0 is 1, load x into x3 through an address dependency on x0; _vb is a parg_t pointer */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id]; /* cpu slot selected by this thread id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM01");
+  check_globals(_a); /* sample-check the initial zeros, then wait on fst_barrier */
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_1_x0 = _a->out_1_x0; /* observed registers are written straight into the context arrays */
+  int *out_1_x3 = _a->out_1_x3;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* one pass per offset within the stride */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) { /* visit cells from high to low index */
+      barrier_wait(_th_id,_i,&barrier[_i]); /* start cell _i together with the other test thread */
+      int trashed_x2; /* holds x0 eor x0, which is always zero but computed from the loaded value */
+asm __volatile__ (
+"\n"
+"#START _litmus_P1\n"
+"#_litmus_P1_0\n\t"
+"ldr %w[x0],[%[x1]]\n"
+"#_litmus_P1_1\n\t"
+"cmp %w[x0],#1\n"
+"#_litmus_P1_2\n\t"
+"b.ne 0f\n"
+"#_litmus_P1_3\n\t"
+"eor %w[x2],%w[x0],%w[x0]\n"
+"#_litmus_P1_4\n\t"
+"ldr %w[x3],[%[x4],%w[x2],sxtw]\n"
+"#_litmus_P1_5\n"
+"0:\n"
+"#END _litmus_P1\n\t"
+:[x3] "=&r" (out_1_x3[_i]),[x0] "=&r" (out_1_x0[_i]),[x2] "=&r" (trashed_x2)
+:[x1] "r" (&_a->y[_i]),"[x3]" (-1),[x4] "r" (&_a->x[_i]) /* x3 is preset to -1, so it stays -1 when the branch skips the second load */
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+typedef struct { /* Argument block handed to one zyva test instance */
+  pm_t *p_mutex; /* mutex created by run and shared by all instances */
+  pb_t *p_barrier; /* barrier shared by the n_exe instances */
+  param_t *_p; /* shared run parameters */
+  int z_id; /* instance index; instance 0 reshuffles cpus in random affinity mode */
+  int *cpus; /* cpu slots assigned to this instance */
+} zyva_t;
+
+#define NT N /* number of threads per instance */
+
+static void *zyva(void *_va) { /* One test instance: runs max_run rounds of the N test threads; returns a hist_t pointer that the caller frees */
+  zyva_t *_a = (zyva_t *) _va;
+  param_t *_b = _a->_p;
+  pb_wait(_a->p_barrier); /* wait on the barrier shared by all instances */
+  pthread_t thread[NT];
+  parg_t parg[N];
+  f_t *fun[] = {&P0,&P1};
+  hist_t *hist = alloc_hist();
+  ctx_t ctx;
+  ctx._p = _b; /* init reads size_of_test through _p */
+
+  init(&ctx);
+  for (int _p = N-1 ; _p >= 0 ; _p--) {
+    parg[_p].th_id = _p; parg[_p]._a = &ctx;
+    parg[_p].cpu = &(_a->cpus[0]); /* every thread gets the same array and indexes it by th_id */
+  }
+
+  for (int n_run = 0 ; n_run < _b->max_run ; n_run++) {
+    if (_b->aff_mode == aff_random) {
+      pb_wait(_a->p_barrier);
+      if (_a->z_id == 0) perm_prefix_ints(&ctx.seed,_a->cpus,_b->ncpus_used,_b->ncpus); /* only instance 0 permutes, between two barrier waits */
+      pb_wait(_a->p_barrier);
+    } else {
+    }
+    if (_b->verbose>1) fprintf(stderr,"Run %i of %i\r", n_run, _b->max_run);
+    reinit(&ctx); /* reset cells, outputs and barrier flags */
+    if (_b->do_change) perm_funs(&ctx.seed,fun,N);
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      launch(&thread[_p],fun[_p],&parg[_p]);
+    }
+    if (_b->do_change) perm_threads(&ctx.seed,thread,NT);
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      join(&thread[_p]);
+    }
+    /* Log final states: one outcome per cell */
+    for (int _i = _b->size_of_test-1 ; _i >= 0 ; _i--) {
+      int _out_1_x0_i = ctx.out_1_x0[_i];
+      int _out_1_x3_i = ctx.out_1_x3[_i];
+      outcome_t o;
+      int cond;
+
+      cond = final_ok(final_cond(_out_1_x0_i,_out_1_x3_i));
+      o[out_1_x0_f] = _out_1_x0_i;
+      o[out_1_x3_f] = _out_1_x3_i;
+      add_outcome(hist,1,o,cond); /* count 1; cond is passed as the show flag */
+      if (cond) { hist->n_pos++; } else { hist->n_neg++; }
+    }
+  }
+
+  finalize(&ctx);
+  return hist;
+}
+
+#define ENOUGH 10
+
+static int postlude(FILE *out,cmd_t *cmd,hist_t *hist,count_t p_true,count_t p_false,tsc_t total) { /* Print the report for ARMARM01 to out; returns 1 when p_true is 0. NOTE(review): cmd is not used here */
+  fprintf(out,"Test ARMARM01 Forbidden\n");
+  fprintf(out,"Histogram (%i states)\n",finals_outs(hist->outcomes));
+  just_dump_outcomes(out,hist);
+  int cond = p_true == 0; /* the condition is ~exists, so it holds when no matching outcome was seen */
+  fprintf(out,"%s\n",cond?"Ok":"No");
+  fprintf(out,"\nWitnesses\n");
+  fprintf(out,"Positive: %" PCTR ", Negative: %" PCTR "\n",p_false,p_true); /* Positive prints p_false and Negative prints p_true */
+  fprintf(out,"Condition ~exists (1:X0=1 /\\ 1:X3=0) is %svalidated\n",cond ? "" : "NOT ");
+  fprintf(out,"Hash=e5d401d98b0063559060cb31236f54de\n");
+  fprintf(out,"Com=Rf Fr\n");
+  fprintf(out,"Orig=PodWWPL RfeLP DpAddrdR Fre\n");
+  count_t cond_true = p_true;
+  count_t cond_false = p_false;
+  fprintf(out,"Observation ARMARM01 %s %" PCTR " %" PCTR "\n",!cond_true ? "Never" : !cond_false ? "Always" : "Sometimes",cond_true,cond_false);
+  if (p_true > 0) { /* empty body */
+  }
+  fprintf(out,"Time ARMARM01 %.2f\n",total / 1000000.0);
+  fflush(out);
+  return cond;
+}
+
+static int run(cmd_t *cmd,cpus_t *def_all_cpus,FILE *out) { /* Run n_exe concurrent instances, merge their histograms and print the report; returns the postlude result. NOTE(review): def_all_cpus is not used here */
+  tsc_t start = timeofday();
+  param_t prm ;
+/* Copy settings from cmd into the shared parameter block */
+  prm.verbose = cmd->verbose;
+  prm.size_of_test = cmd->size_of_test;
+  prm.max_run = cmd->max_run;
+  prm.stride = cmd->stride;
+  prm.do_change = 1;
+  if (cmd->fix) prm.do_change = 0; /* cmd->fix turns off the permutations done in zyva */
+/* Compute the number of concurrent test instances */
+  int n_avail = cmd->avail > 0 ? cmd->avail : cmd->aff_cpus->sz;
+  if (n_avail >  cmd->aff_cpus->sz) log_error("Warning: avail=%i, available=%i\n",n_avail, cmd->aff_cpus->sz);
+  int n_exe;
+  if (cmd->n_exe > 0) {
+    n_exe = cmd->n_exe; /* an explicit request wins */
+  } else {
+    n_exe = n_avail < N ? 1 : n_avail / N; /* otherwise one instance per N available cpus, at least one */
+  }
+/* Set affinity parameters */
+  cpus_t *all_cpus = cmd->aff_cpus;
+  int aff_cpus_sz = cmd->aff_mode == aff_random ? max(all_cpus->sz,N*n_exe) : N*n_exe;
+  int aff_cpus[aff_cpus_sz]; /* cpu slots handed to the instances, N per instance; NOTE(review): only filled below for aff_random and aff_incr */
+  prm.aff_mode = cmd->aff_mode;
+  prm.ncpus = aff_cpus_sz;
+  prm.ncpus_used = N*n_exe;
+/* Show parameters to user */
+  if (prm.verbose) {
+    log_error( "ARMARM01: n=%i, r=%i, s=%i",n_exe,prm.max_run,prm.size_of_test);
+    log_error(", st=%i",prm.stride);
+    if (cmd->aff_mode == aff_incr) {
+      log_error( ", i=%i",cmd->aff_incr);
+    } else if (cmd->aff_mode == aff_random) {
+      log_error(", +ra");
+    } else if (cmd->aff_mode == aff_custom) {
+      log_error(", +ca");
+    } else if (cmd->aff_mode == aff_scan) {
+      log_error(", +sa");
+    }
+    log_error(", p='");
+    cpus_dump(stderr,cmd->aff_cpus);
+    log_error("'");
+    log_error("\n");
+  }
+  if (cmd->aff_mode == aff_random) {
+    for (int k = 0 ; k < aff_cpus_sz ; k++) {
+      aff_cpus[k] = all_cpus->cpu[k % all_cpus->sz]; /* fill by cycling through the cpu list */
+    }
+  }
+  hist_t *hist = NULL;
+  int n_th = n_exe-1; /* the last instance runs on the calling thread */
+  pthread_t th[n_th]; /* NOTE(review): zero-length VLA when n_exe == 1 */
+  zyva_t zarg[n_exe];
+  pm_t *p_mutex = pm_create();
+  pb_t *p_barrier = pb_create(n_exe);
+  int next_cpu = 0;
+  int delta = cmd->aff_incr;
+  if (delta <= 0) {
+    for (int k=0 ; k < all_cpus->sz ; k++) all_cpus->cpu[k] = -1; /* every cpu id becomes -1; presumably means no pinning, confirm in force_one_affinity */
+    delta = 1;
+  } else {
+    delta %= all_cpus->sz;
+  }
+  int start_scan=0, max_start=gcd(delta,all_cpus->sz);
+  int *aff_p = aff_cpus;
+  for (int k=0 ; k < n_exe ; k++) {
+    zyva_t *p = &zarg[k];
+    p->_p = &prm;
+    p->p_mutex = p_mutex; p->p_barrier = p_barrier; 
+    p->z_id = k;
+    p->cpus = aff_p; /* this instance owns the next N slots */
+    if (cmd->aff_mode != aff_incr) {
+      aff_p += N;
+    } else {
+      for (int i=0 ; i < N ; i++) {
+        *aff_p = all_cpus->cpu[next_cpu]; aff_p++;
+        next_cpu += delta; next_cpu %= all_cpus->sz; /* step by delta, wrapping around the cpu list */
+        if (next_cpu == start_scan) { /* back at the starting point: move the start forward */
+          start_scan++ ; start_scan %= max_start;
+          next_cpu = start_scan;
+        }
+      }
+    }
+    if (k < n_th) {
+      launch(&th[k],zyva,p);
+    } else {
+      hist = (hist_t *)zyva(p); /* last instance runs here and its histogram becomes the merge target */
+    }
+  }
+
+  count_t n_outs = prm.size_of_test; n_outs *= prm.max_run; /* outcomes expected from one instance */
+  for (int k=0 ; k < n_th ; k++) {
+    hist_t *hk = (hist_t *)join(&th[k]);
+    if (sum_hist(hk) != n_outs || hk->n_pos + hk->n_neg != n_outs) {
+      fatal("ARMARM01, sum_hist");
+    }
+    merge_hists(hist,hk);
+    free_hist(hk);
+  }
+  cpus_free(all_cpus); /* releases cmd->aff_cpus */
+  tsc_t total = timeofday() - start;
+  pm_free(p_mutex);
+  pb_free(p_barrier);
+
+  n_outs *= n_exe ; /* outcomes expected across all instances */
+  if (sum_hist(hist) != n_outs || hist->n_pos + hist->n_neg != n_outs) {
+    fatal("ARMARM01, sum_hist") ;
+  }
+  count_t p_true = hist->n_pos, p_false = hist->n_neg;
+  int cond = postlude(out,cmd,hist,p_true,p_false,total);
+  free_hist(hist);
+  return cond;
+}
+
+
+int main(int argc, char **argv) { /* Parse the command line, run the test, exit with success iff run returns non-zero */
+  cpus_t *def_all_cpus = read_force_affinity(AVAIL,0);
+  if (def_all_cpus->sz < N) { /* fewer cpus than test threads: exit successfully without running */
+    cpus_free(def_all_cpus);
+    return EXIT_SUCCESS;
+  }
+  cmd_t def = { 0, NUMBER_OF_RUN, SIZE_OF_TEST, STRIDE, AVAIL, 0, 0, aff_incr, 0, 0, AFF_INCR, def_all_cpus, NULL, -1, MAX_LOOP, NULL, NULL, -1, -1, -1, 0, 0}; /* positional defaults; field order comes from cmd_t, declared outside this file */
+  cmd_t cmd = def;
+  parse_cmd(argc,argv,&def,&cmd);
+  int cond = run(&cmd,def_all_cpus,stdout);
+  if (def_all_cpus != cmd.aff_cpus) cpus_free(def_all_cpus); /* run already freed cmd.aff_cpus; free the default list only if it is a different one */
+  return cond ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/tests/tcg/mttcg/aarch64/ARMARM02.c b/tests/tcg/mttcg/aarch64/ARMARM02.c
new file mode 100644
index 0000000..0846b53
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/ARMARM02.c
@@ -0,0 +1,571 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* This C source is a product of litmus7 and includes source that is        */
+/* governed by the CeCILL-B license.                                        */
+/****************************************************************************/
+/* Parameters */
+#define SIZE_OF_TEST 100000
+#define NUMBER_OF_RUN 10
+#define AVAIL 0
+#define STRIDE 1
+#define MAX_LOOP 0
+#define N 3
+#define AFF_INCR (0)
+/* Includes */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <time.h>
+#include <limits.h>
+#include "utils.h"
+#include "outs.h"
+#include "affinity.h"
+
+/* params */
+typedef struct {
+  int verbose; /* copied from cmd->verbose by run() */
+  int size_of_test,max_run; /* slots per run (array length), and number of runs looped over in zyva() */
+  int stride; /* step of the inner slot loop in the P* threads */
+  aff_mode_t aff_mode; /* copied from cmd->aff_mode by run() */
+  int ncpus, ncpus_used; /* run() sets these to aff_cpus_sz and N*n_exe */
+  int do_change; /* non-zero: zyva() permutes functions and threads each run; 0 when cmd->fix is set */
+} param_t;
+
+
+/* Full memory barrier */
+inline static void mbar(void) {
+  asm __volatile__ ("dsb sy" ::: "memory"); /* emits DSB SY; the "memory" clobber also stops the compiler moving memory accesses across it */
+}
+
+/* Barriers macros */
+inline static void barrier_wait(unsigned int id, unsigned int k, int volatile *b) {
+  /* Thread (k % N) opens gate *b; every other thread spins until it is open. */
+  const int opener = (k % N) == id;
+  if (opener) *b = 1 ;
+  while (!opener && *b == 0) {
+  }
+}
+
+/**********************/
+/* Context definition */
+/**********************/
+
+
+typedef struct {
+/* Shared variables: one cell per slot, allocated in init() */
+  int *y;
+  int *x;
+/* Final content of observed registers, one cell per slot */
+  int *out_1_x0;
+  int *out_1_x3;
+  int *out_2_x0;
+  int *out_2_x3;
+/* Barrier the test threads wait on at the end of check_globals() */
+  pb_t *fst_barrier;
+/* Per-slot gate flags used by barrier_wait() in the litmus loop */
+  int volatile *barrier;
+/* Instance seed, set from rand() in init() */
+  st_t seed;
+/* Parameters shared with run() */
+  param_t *_p;
+} ctx_t;
+
+inline static int final_cond(int _out_1_x0,int _out_1_x3,int _out_2_x0,int _out_2_x3) {
+  /*
+   * Body of the test's final condition, as printed by postlude():
+   *
+   *     1:X0=1 /\ 1:X3=0  \/  2:X0=1 /\ 2:X3=0
+   *
+   * That is, the outcome matches when at least one of the two
+   * observers P1 and P2 ended with X0 equal to 1 and X3 equal to 0.
+   *
+   * Returns 1 on a match and 0 otherwise.
+   */
+  int p1_matches = 0;
+  int p2_matches = 0;
+
+  if (_out_1_x0 == 1 && _out_1_x3 == 0) {
+    p1_matches = 1;
+  }
+  if (_out_2_x0 == 1 && _out_2_x3 == 0) {
+    p2_matches = 1;
+  }
+  if (p1_matches || p2_matches) {
+    return 1;
+  }
+  return 0;
+}
+
+inline static int final_ok(int cond) {
+  return cond; /* passes the condition value through unchanged */
+}
+
+/**********************/
+/* Outcome collection */
+/**********************/
+#define NOUTS 4 /* number of observed registers stored per outcome */
+typedef intmax_t outcome_t[NOUTS];
+
+static const int out_1_x0_f = 0 ; /* index of 1:X0 within an outcome_t */
+static const int out_1_x3_f = 1 ; /* index of 1:X3 */
+static const int out_2_x0_f = 2 ; /* index of 2:X0 */
+static const int out_2_x3_f = 3 ; /* index of 2:X3 */
+
+
+typedef struct hist_t {
+  outs_t *outcomes ; /* outcome table; NULL when empty (see alloc_hist) */
+  count_t n_pos,n_neg ; /* slots where the final condition held / did not hold */
+} hist_t ;
+
+static hist_t *alloc_hist(void) {
+  hist_t *fresh = malloc_check(sizeof(*fresh)) ; /* new empty histogram; release with free_hist() */
+  fresh->n_pos = 0 ; fresh->n_neg = 0 ;
+  fresh->outcomes = NULL ;
+  return fresh ;
+}
+
+static void free_hist(hist_t *h) { /* Releases the outcome table, then the histogram itself. */
+  free_outs(h->outcomes) ;
+  free(h) ;
+}
+
+static void add_outcome(hist_t *h, count_t v, outcome_t o, int show) { /* Records outcome o (NOUTS values) in h; v and show are forwarded as given. */
+  h->outcomes = add_outcome_outs(h->outcomes,o,NOUTS,v,show) ;
+}
+
+static void merge_hists(hist_t *h0, hist_t *h1) {
+  /* Fold h1 into h0: outcome table first, then both witness counters. */
+  h0->outcomes = merge_outs(h0->outcomes,h1->outcomes,NOUTS) ;
+  h0->n_neg += h1->n_neg ; h0->n_pos += h1->n_pos ;
+}
+
+static count_t sum_hist(hist_t *h) { /* Total returned by sum_outs() for h's outcome table; run() compares it with the expected outcome count. */
+  return sum_outs(h->outcomes) ;
+}
+
+
+static void do_dump_outcome(FILE *fhist, intmax_t *o, count_t c, int show) { /* Prints one histogram line: count c, '*' if show else ':', then the four register values. */
+  fprintf(fhist,"%-6"PCTR"%c>1:X0=%i; 1:X3=%i; 2:X0=%i; 2:X3=%i;\n",c,show ? '*' : ':',(int)o[out_1_x0_f],(int)o[out_1_x3_f],(int)o[out_2_x0_f],(int)o[out_2_x3_f]);
+}
+
+static void just_dump_outcomes(FILE *fhist, hist_t *h) { /* Dumps h's outcome table to fhist using do_dump_outcome as the line printer. */
+  outcome_t buff ; /* scratch outcome handed to dump_outs() */
+  dump_outs(fhist,do_dump_outcome,h->outcomes,buff,NOUTS) ;
+}
+
+/*******************************************************/
+/* Context allocation, freeing and reinitialization    */
+/*******************************************************/
+
+static void init(ctx_t *_a) {
+  /* Seed the context and allocate one cell per slot for registers, shared locations and gates. */
+  const size_t n_slots = (size_t)_a->_p->size_of_test;
+  _a->seed = rand();
+  _a->out_1_x0 = malloc_check(n_slots*sizeof(_a->out_1_x0[0]));
+  _a->out_1_x3 = malloc_check(n_slots*sizeof(_a->out_1_x3[0]));
+  _a->out_2_x0 = malloc_check(n_slots*sizeof(_a->out_2_x0[0]));
+  _a->out_2_x3 = malloc_check(n_slots*sizeof(_a->out_2_x3[0]));
+  _a->y = malloc_check(n_slots*sizeof(_a->y[0]));
+  _a->x = malloc_check(n_slots*sizeof(_a->x[0]));
+  _a->fst_barrier = pb_create(N);
+  _a->barrier = malloc_check(n_slots*sizeof(_a->barrier[0]));
+}
+
+static void finalize(ctx_t *_a) {
+  /* Release everything init() set up. */
+  /* Only the barrier pointer needs a cast, to drop its volatile qualifier. */
+  pb_free(_a->fst_barrier);
+  free((void *)_a->barrier);
+  free(_a->out_2_x3); free(_a->out_2_x0);
+  free(_a->out_1_x3); free(_a->out_1_x0);
+  free(_a->x);
+  free(_a->y);
+}
+
+static void reinit(ctx_t *_a) {
+  /* Reset every slot before a run: locations and gates to 0, registers to a sentinel. */
+  const int unset = -239487;
+  for (int slot = _a->_p->size_of_test ; slot-- > 0 ; ) {
+    _a->barrier[slot] = 0;
+    _a->x[slot] = 0;
+    _a->y[slot] = 0;
+    _a->out_1_x0[slot] = _a->out_1_x3[slot] = unset;
+    _a->out_2_x0[slot] = _a->out_2_x3[slot] = unset;
+  }
+}
+
+/**************************************/
+/* Prefetch (and check) global values */
+/**************************************/
+
+static void check_globals(ctx_t *_a) {
+  /* Per slot, on a random bit each, check y then x still hold 0; then wait on fst_barrier. */
+  st_t *rng = &(_a->seed);
+  for (int slot = _a->_p->size_of_test ; slot-- > 0 ; ) {
+    if (rand_bit(rng)) { if (_a->y[slot] != 0) fatal("ARMARM02, check_globals failed"); }
+    if (rand_bit(rng)) { if (_a->x[slot] != 0) fatal("ARMARM02, check_globals failed"); }
+  }
+  pb_wait(_a->fst_barrier);
+}
+
+/***************/
+/* Litmus code */
+/***************/
+
+typedef struct {
+  int th_id; /* Index of this test thread; also its index into cpu[] */
+  int *cpu; /* CPU table; the thread is pinned to cpu[th_id] */
+  ctx_t *_a;   /* Context shared by all threads of one test instance */
+} parg_t;
+
+
+
+
+
+static void *P0(void *_vb) { /* Test thread P0: per slot, stores 1 to x[_i] (str), then 1 to y[_i] (stlr). */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM02");
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* one pass per residue class modulo _stride */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]); /* meet the other threads on slot _i */
+      int trashed_x0; /* scratch registers, values discarded */
+      int trashed_x2;
+asm __volatile__ (
+"\n"
+"#START _litmus_P0\n"
+"#_litmus_P0_0\n\t"
+"mov %w[x0],#1\n"
+"#_litmus_P0_1\n\t"
+"str %w[x0],[%[x1]]\n"
+"#_litmus_P0_2\n\t"
+"mov %w[x2],#1\n"
+"#_litmus_P0_3\n\t"
+"stlr %w[x2],[%[x3]]\n"
+"#END _litmus_P0\n\t"
+:[x2] "=&r" (trashed_x2),[x0] "=&r" (trashed_x0)
+:[x1] "r" (&_a->x[_i]),[x3] "r" (&_a->y[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+static void *P1(void *_vb) { /* Test thread P1: per slot, loads y[_i] into X0; only if it is 1, loads x[_i] into X3. */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM02");
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_1_x0 = _a->out_1_x0;
+  int *out_1_x3 = _a->out_1_x3;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* one pass per residue class modulo _stride */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]); /* meet the other threads on slot _i */
+      int trashed_x2; /* receives X0 eor X0 (always 0), used as the offset of the second load */
+asm __volatile__ (
+"\n"
+"#START _litmus_P1\n"
+"#_litmus_P1_0\n\t"
+"ldr %w[x0],[%[x1]]\n"
+"#_litmus_P1_1\n\t"
+"cmp %w[x0],#1\n"
+"#_litmus_P1_2\n\t"
+"b.ne 0f\n"
+"#_litmus_P1_3\n\t"
+"eor %w[x2],%w[x0],%w[x0]\n"
+"#_litmus_P1_4\n\t"
+"ldr %w[x3],[%[x4],%w[x2],sxtw]\n"
+"#_litmus_P1_5\n"
+"0:\n"
+"#END _litmus_P1\n\t"
+:[x3] "=&r" (out_1_x3[_i]),[x0] "=&r" (out_1_x0[_i]),[x2] "=&r" (trashed_x2)
+:[x1] "r" (&_a->y[_i]),"[x3]" (-1),[x4] "r" (&_a->x[_i]) /* "[x3]" (-1) presets X3 to -1 for the branch-taken path */
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+static void *P2(void *_vb) { /* Test thread P2: same code as P1, recording into out_2_x0 / out_2_x3. */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM02");
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_2_x0 = _a->out_2_x0;
+  int *out_2_x3 = _a->out_2_x3;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* one pass per residue class modulo _stride */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]); /* meet the other threads on slot _i */
+      int trashed_x2; /* receives X0 eor X0 (always 0), used as the offset of the second load */
+asm __volatile__ (
+"\n"
+"#START _litmus_P2\n"
+"#_litmus_P2_0\n\t"
+"ldr %w[x0],[%[x1]]\n"
+"#_litmus_P2_1\n\t"
+"cmp %w[x0],#1\n"
+"#_litmus_P2_2\n\t"
+"b.ne 0f\n"
+"#_litmus_P2_3\n\t"
+"eor %w[x2],%w[x0],%w[x0]\n"
+"#_litmus_P2_4\n\t"
+"ldr %w[x3],[%[x4],%w[x2],sxtw]\n"
+"#_litmus_P2_5\n"
+"0:\n"
+"#END _litmus_P2\n\t"
+:[x3] "=&r" (out_2_x3[_i]),[x0] "=&r" (out_2_x0[_i]),[x2] "=&r" (trashed_x2)
+:[x1] "r" (&_a->y[_i]),"[x3]" (-1),[x4] "r" (&_a->x[_i]) /* "[x3]" (-1) presets X3 to -1 for the branch-taken path */
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+typedef struct {
+  pm_t *p_mutex; /* created once in run() and shared by all instances */
+  pb_t *p_barrier; /* barrier sized for n_exe instances, created in run() */
+  param_t *_p; /* parameters shared by all instances */
+  int z_id; /* instance number, 0..n_exe-1 */
+  int *cpus; /* this instance's part of run()'s aff_cpus array */
+} zyva_t;
+
+#define NT N
+
+static void *zyva(void *_va) { /* One test instance: runs the N test threads max_run times and returns a histogram (hist_t *) of the outcomes. */
+  zyva_t *_a = (zyva_t *) _va;
+  param_t *_b = _a->_p;
+  pb_wait(_a->p_barrier);
+  pthread_t thread[NT];
+  parg_t parg[N];
+  f_t *fun[] = {&P0,&P1,&P2};
+  hist_t *hist = alloc_hist();
+  ctx_t ctx;
+  ctx._p = _b;
+
+  init(&ctx);
+  for (int _p = N-1 ; _p >= 0 ; _p--) { /* every thread gets the same context and CPU table, plus its own th_id */
+    parg[_p].th_id = _p; parg[_p]._a = &ctx;
+    parg[_p].cpu = &(_a->cpus[0]);
+  }
+
+  for (int n_run = 0 ; n_run < _b->max_run ; n_run++) {
+    if (_b->aff_mode == aff_random) { /* instance 0 reshuffles the CPU table between two barrier waits */
+      pb_wait(_a->p_barrier);
+      if (_a->z_id == 0) perm_prefix_ints(&ctx.seed,_a->cpus,_b->ncpus_used,_b->ncpus);
+      pb_wait(_a->p_barrier);
+    } else {
+    }
+    if (_b->verbose>1) fprintf(stderr,"Run %i of %i\r", n_run, _b->max_run);
+    reinit(&ctx);
+    if (_b->do_change) perm_funs(&ctx.seed,fun,N);
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      launch(&thread[_p],fun[_p],&parg[_p]);
+    }
+    if (_b->do_change) perm_threads(&ctx.seed,thread,NT);
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      join(&thread[_p]);
+    }
+    /* Log final states: one outcome per slot */
+    for (int _i = _b->size_of_test-1 ; _i >= 0 ; _i--) {
+      int _out_1_x0_i = ctx.out_1_x0[_i];
+      int _out_1_x3_i = ctx.out_1_x3[_i];
+      int _out_2_x0_i = ctx.out_2_x0[_i];
+      int _out_2_x3_i = ctx.out_2_x3[_i];
+      outcome_t o;
+      int cond;
+
+      cond = final_ok(final_cond(_out_1_x0_i,_out_1_x3_i,_out_2_x0_i,_out_2_x3_i));
+      o[out_1_x0_f] = _out_1_x0_i;
+      o[out_1_x3_f] = _out_1_x3_i;
+      o[out_2_x0_f] = _out_2_x0_i;
+      o[out_2_x3_f] = _out_2_x3_i;
+      add_outcome(hist,1,o,cond);
+      if (cond) { hist->n_pos++; } else { hist->n_neg++; }
+    }
+  }
+
+  finalize(&ctx);
+  return hist; /* the caller releases it with free_hist() */
+}
+
+#define ENOUGH 10
+
+static int postlude(FILE *out,cmd_t *cmd,hist_t *hist,count_t p_true,count_t p_false,tsc_t total) { /* Prints the report to out; returns 1 when the condition was never observed (p_true == 0). cmd is not used. */
+  fprintf(out,"Test ARMARM02 Forbidden\n");
+  fprintf(out,"Histogram (%i states)\n",finals_outs(hist->outcomes));
+  just_dump_outcomes(out,hist);
+  int cond = p_true == 0; /* the test passes only if no slot matched */
+  fprintf(out,"%s\n",cond?"Ok":"No");
+  fprintf(out,"\nWitnesses\n");
+  fprintf(out,"Positive: %" PCTR ", Negative: %" PCTR "\n",p_false,p_true); /* counts are swapped on purpose: the printed condition is negated (~exists) */
+  fprintf(out,"Condition ~exists (1:X0=1 /\\ 1:X3=0 \\/ 2:X0=1 /\\ 2:X3=0) is %svalidated\n",cond ? "" : "NOT ");
+  fprintf(out,"Hash=331667937512d41933d43e7fe662a69e\n");
+  count_t cond_true = p_true;
+  count_t cond_false = p_false;
+  fprintf(out,"Observation ARMARM02 %s %" PCTR " %" PCTR "\n",!cond_true ? "Never" : !cond_false ? "Always" : "Sometimes",cond_true,cond_false);
+  if (p_true > 0) { /* intentionally empty in this generated file */
+  }
+  fprintf(out,"Time ARMARM02 %.2f\n",total / 1000000.0);
+  fflush(out);
+  return cond;
+}
+
+static int run(cmd_t *cmd,cpus_t *def_all_cpus,FILE *out) {
+  /* Runs n_exe concurrent instances of the test (zyva), merges their histograms and prints the report to out; returns postlude()'s result. def_all_cpus is not used here. */
+  tsc_t start = timeofday(); param_t prm ;
+/* Set some parameters */
+  prm.verbose = cmd->verbose;
+  prm.size_of_test = cmd->size_of_test;
+  prm.max_run = cmd->max_run;
+  prm.stride = cmd->stride;
+  prm.do_change = 1;
+  if (cmd->fix) prm.do_change = 0;
+/* Compute the number of concurrent test instances */
+  int n_avail = cmd->avail > 0 ? cmd->avail : cmd->aff_cpus->sz;
+  if (n_avail >  cmd->aff_cpus->sz) log_error("Warning: avail=%i, available=%i\n",n_avail, cmd->aff_cpus->sz);
+  int n_exe;
+  if (cmd->n_exe > 0) {
+    n_exe = cmd->n_exe;
+  } else {
+    n_exe = n_avail < N ? 1 : n_avail / N; /* always at least one instance */
+  }
+/* Set affinity parameters */
+  cpus_t *all_cpus = cmd->aff_cpus;
+  int aff_cpus_sz = cmd->aff_mode == aff_random ? max(all_cpus->sz,N*n_exe) : N*n_exe;
+  int aff_cpus[aff_cpus_sz];
+  prm.aff_mode = cmd->aff_mode;
+  prm.ncpus = aff_cpus_sz;
+  prm.ncpus_used = N*n_exe;
+/* Show parameters to user */
+  if (prm.verbose) {
+    log_error( "ARMARM02: n=%i, r=%i, s=%i",n_exe,prm.max_run,prm.size_of_test);
+    log_error(", st=%i",prm.stride);
+    if (cmd->aff_mode == aff_incr) {
+      log_error( ", i=%i",cmd->aff_incr);
+    } else if (cmd->aff_mode == aff_random) {
+      log_error(", +ra");
+    } else if (cmd->aff_mode == aff_custom) {
+      log_error(", +ca");
+    } else if (cmd->aff_mode == aff_scan) {
+      log_error(", +sa");
+    }
+    log_error(", p='");
+    cpus_dump(stderr,cmd->aff_cpus);
+    log_error("'");
+    log_error("\n");
+  }
+  if (cmd->aff_mode == aff_random) { /* fill the table cyclically from the CPU set */
+    for (int k = 0 ; k < aff_cpus_sz ; k++) {
+      aff_cpus[k] = all_cpus->cpu[k % all_cpus->sz];
+    }
+  }
+  hist_t *hist = NULL;
+  int n_th = n_exe-1; /* the last instance runs on the calling thread */
+  pthread_t th[n_th > 0 ? n_th : 1]; /* fix: never a zero-length VLA (n_th is 0 when a single instance runs) */
+  zyva_t zarg[n_exe];
+  pm_t *p_mutex = pm_create();
+  pb_t *p_barrier = pb_create(n_exe);
+  int next_cpu = 0;
+  int delta = cmd->aff_incr;
+  if (delta <= 0) { /* no positive increment: every CPU entry becomes -1 (presumably "no pinning" - confirm in affinity.c) */
+    for (int k=0 ; k < all_cpus->sz ; k++) all_cpus->cpu[k] = -1;
+    delta = 1;
+  } else {
+    delta %= all_cpus->sz;
+  }
+  int start_scan=0, max_start=gcd(delta,all_cpus->sz);
+  int *aff_p = aff_cpus;
+  for (int k=0 ; k < n_exe ; k++) {
+    zyva_t *p = &zarg[k];
+    p->_p = &prm;
+    p->p_mutex = p_mutex; p->p_barrier = p_barrier;
+    p->z_id = k;
+    p->cpus = aff_p; /* instance k owns N consecutive entries of aff_cpus */
+    if (cmd->aff_mode != aff_incr) {
+      aff_p += N;
+    } else {
+      for (int i=0 ; i < N ; i++) { /* walk the CPU set in steps of delta, restarting one further on each wrap */
+        *aff_p = all_cpus->cpu[next_cpu]; aff_p++;
+        next_cpu += delta; next_cpu %= all_cpus->sz;
+        if (next_cpu == start_scan) {
+          start_scan++ ; start_scan %= max_start;
+          next_cpu = start_scan;
+        }
+      }
+    }
+    if (k < n_th) {
+      launch(&th[k],zyva,p);
+    } else {
+      hist = (hist_t *)zyva(p);
+    }
+  }
+
+  count_t n_outs = prm.size_of_test; n_outs *= prm.max_run; /* outcomes expected from one instance */
+  for (int k=0 ; k < n_th ; k++) {
+    hist_t *hk = (hist_t *)join(&th[k]);
+    if (sum_hist(hk) != n_outs || hk->n_pos + hk->n_neg != n_outs) {
+      fatal("ARMARM02, sum_hist");
+    }
+    merge_hists(hist,hk);
+    free_hist(hk);
+  }
+  cpus_free(all_cpus); /* frees cmd->aff_cpus; main() relies on this */
+  tsc_t total = timeofday() - start;
+  pm_free(p_mutex);
+  pb_free(p_barrier);
+
+  n_outs *= n_exe ; /* outcomes expected from all instances together */
+  if (sum_hist(hist) != n_outs || hist->n_pos + hist->n_neg != n_outs) {
+    fatal("ARMARM02, sum_hist") ;
+  }
+  count_t p_true = hist->n_pos, p_false = hist->n_neg;
+  int cond = postlude(out,cmd,hist,p_true,p_false,total);
+  free_hist(hist);
+  return cond;
+}
+
+
+int main(int argc, char **argv) { /* Entry point: build defaults, parse the command line, run the test. */
+  cpus_t *def_all_cpus = read_force_affinity(AVAIL,0);
+  if (def_all_cpus->sz < N) { /* CPU set smaller than N: do not run, still report success */
+    cpus_free(def_all_cpus);
+    return EXIT_SUCCESS;
+  }
+  cmd_t def = { 0, NUMBER_OF_RUN, SIZE_OF_TEST, STRIDE, AVAIL, 0, 0, aff_incr, 0, 0, AFF_INCR, def_all_cpus, NULL, -1, MAX_LOOP, NULL, NULL, -1, -1, -1, 0, 0}; /* positional: field order comes from the cmd_t declaration, not shown in this file */
+  cmd_t cmd = def;
+  parse_cmd(argc,argv,&def,&cmd);
+  int cond = run(&cmd,def_all_cpus,stdout);
+  if (def_all_cpus != cmd.aff_cpus) cpus_free(def_all_cpus); /* run() already freed cmd.aff_cpus */
+  return cond ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/tests/tcg/mttcg/aarch64/ARMARM03.c b/tests/tcg/mttcg/aarch64/ARMARM03.c
new file mode 100644
index 0000000..cc7ef33
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/ARMARM03.c
@@ -0,0 +1,498 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* This C source is a product of litmus7 and includes source that is        */
+/* governed by the CeCILL-B license.                                        */
+/****************************************************************************/
+/* Parameters */
+#define SIZE_OF_TEST 100000
+#define NUMBER_OF_RUN 10
+#define AVAIL 0
+#define STRIDE 1
+#define MAX_LOOP 0
+#define N 2
+#define AFF_INCR (0)
+/* Includes */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <time.h>
+#include <limits.h>
+#include "utils.h"
+#include "outs.h"
+#include "affinity.h"
+
+/* params */
+typedef struct {
+  int verbose; /* copied from cmd->verbose by run() */
+  int size_of_test,max_run; /* slots per run (array length), and number of runs looped over in zyva() */
+  int stride; /* step of the inner slot loop in the P* threads */
+  aff_mode_t aff_mode; /* copied from cmd->aff_mode by run() */
+  int ncpus, ncpus_used; /* run() sets these to aff_cpus_sz and N*n_exe */
+  int do_change; /* non-zero: zyva() permutes functions and threads each run; 0 when cmd->fix is set */
+} param_t;
+
+
+/* Full memory barrier */
+inline static void mbar(void) {
+  asm __volatile__ ("dsb sy" ::: "memory"); /* emits DSB SY; the "memory" clobber also stops the compiler moving memory accesses across it */
+}
+
+/* Barriers macros */
+inline static void barrier_wait(unsigned int id, unsigned int k, int volatile *b) {
+  /* Thread (k % N) opens gate *b; every other thread spins until it is open. */
+  const int opener = (k % N) == id;
+  if (opener) *b = 1 ;
+  while (!opener && *b == 0) {
+  }
+}
+
+/**********************/
+/* Context definition */
+/**********************/
+
+
+typedef struct {
+/* Shared variables: one cell per slot, allocated in init() */
+  int *z;
+  int* *y; /* each cell holds a pointer; reinit() points it at z[_i] */
+  int *x;
+/* Final content of observed registers, one cell per slot */
+  int* *out_1_x0; /* pointer value P1 loaded from y[_i] */
+  int *out_1_x9;
+/* Barrier the test threads wait on at the end of check_globals() */
+  pb_t *fst_barrier;
+/* Per-slot gate flags used by barrier_wait() in the litmus loop */
+  int volatile *barrier;
+/* Instance seed, set from rand() in init() */
+  st_t seed;
+/* Parameters shared with run() */
+  param_t *_p;
+} ctx_t;
+
+inline static int final_cond(int* _out_1_x0,int _out_1_x9,void *_val_x) {
+  return (_out_1_x9 == 0) && ((void *)_out_1_x0 == _val_x); /* 1:X0 holds the address of x and 1:X9 is 0 */
+}
+
+inline static int final_ok(int cond) {
+  return cond; /* passes the condition value through unchanged */
+}
+
+static int idx_addr(ctx_t *_a,int _i,void *v_addr) { /* Maps a pointer seen by the test to its index in pretty_addr[]. */
+  if (v_addr == NULL) return 0;
+  if (v_addr == (void *)&(_a->z[_i])) return 1;
+  if (v_addr == (void *)&(_a->y[_i])) return 2;
+  if (v_addr == (void *)&(_a->x[_i])) return 3;
+  fatal("ARMARM03, ???"); return -1; /* pointer is none of slot _i's locations */
+}
+
+static char *pretty_addr[4] = {"0","z","y","x",}; /* printable names, indexed by the value idx_addr() returns */
+
+/**********************/
+/* Outcome collection */
+/**********************/
+#define NOUTS 2 /* number of observed registers stored per outcome */
+typedef intmax_t outcome_t[NOUTS];
+
+static const int out_1_x0_f = 0 ; /* index of 1:X0 within an outcome_t (stored as an idx_addr() index) */
+static const int out_1_x9_f = 1 ; /* index of 1:X9 */
+
+
+typedef struct hist_t {
+  outs_t *outcomes ; /* outcome table; NULL when empty (see alloc_hist) */
+  count_t n_pos,n_neg ; /* slots where the final condition held / did not hold */
+} hist_t ;
+
+static hist_t *alloc_hist(void) {
+  hist_t *fresh = malloc_check(sizeof(*fresh)) ; /* new empty histogram; release with free_hist() */
+  fresh->n_pos = 0 ; fresh->n_neg = 0 ;
+  fresh->outcomes = NULL ;
+  return fresh ;
+}
+
+static void free_hist(hist_t *h) { /* Releases the outcome table, then the histogram itself. */
+  free_outs(h->outcomes) ;
+  free(h) ;
+}
+
+static void add_outcome(hist_t *h, count_t v, outcome_t o, int show) { /* Records outcome o (NOUTS values) in h; v and show are forwarded as given. */
+  h->outcomes = add_outcome_outs(h->outcomes,o,NOUTS,v,show) ;
+}
+
+static void merge_hists(hist_t *h0, hist_t *h1) {
+  /* Fold h1 into h0: outcome table first, then both witness counters. */
+  h0->outcomes = merge_outs(h0->outcomes,h1->outcomes,NOUTS) ;
+  h0->n_neg += h1->n_neg ; h0->n_pos += h1->n_pos ;
+}
+
+static count_t sum_hist(hist_t *h) { /* Total returned by sum_outs() for h's outcome table; run() compares it with the expected outcome count. */
+  return sum_outs(h->outcomes) ;
+}
+
+
+static void do_dump_outcome(FILE *fhist, intmax_t *o, count_t c, int show) { /* Prints one histogram line: count c, '*' if show else ':', 1:X0 as a location name and 1:X9 as an int. */
+  fprintf(fhist,"%-6"PCTR"%c>1:X0=%s; 1:X9=%i;\n",c,show ? '*' : ':',pretty_addr[o[out_1_x0_f]],(int)o[out_1_x9_f]);
+}
+
+static void just_dump_outcomes(FILE *fhist, hist_t *h) { /* Dumps h's outcome table to fhist using do_dump_outcome as the line printer. */
+  outcome_t buff ; /* scratch outcome handed to dump_outs() */
+  dump_outs(fhist,do_dump_outcome,h->outcomes,buff,NOUTS) ;
+}
+
+/*******************************************************/
+/* Context allocation, freeing and reinitialization    */
+/*******************************************************/
+
+static void init(ctx_t *_a) {
+  /* Seed the context and allocate one cell per slot for registers, shared locations and gates. */
+  const size_t n_slots = (size_t)_a->_p->size_of_test;
+  _a->seed = rand();
+  _a->out_1_x0 = malloc_check(n_slots*sizeof(_a->out_1_x0[0]));
+  _a->out_1_x9 = malloc_check(n_slots*sizeof(_a->out_1_x9[0]));
+  _a->z = malloc_check(n_slots*sizeof(_a->z[0]));
+  _a->y = malloc_check(n_slots*sizeof(_a->y[0]));
+  _a->x = malloc_check(n_slots*sizeof(_a->x[0]));
+  _a->fst_barrier = pb_create(N);
+  _a->barrier = malloc_check(n_slots*sizeof(_a->barrier[0]));
+}
+
+static void finalize(ctx_t *_a) {
+  /* Release everything init() set up; only barrier needs a cast, to drop volatile. */
+  pb_free(_a->fst_barrier);
+  free((void *)_a->barrier);
+  free(_a->out_1_x9); free(_a->out_1_x0);
+  free(_a->x);
+  free(_a->y);
+  free(_a->z);
+}
+
+static void reinit(ctx_t *_a) {
+  /* Reset every slot before a run: z=-1, y points at z, x=0, gate closed, registers unset. */
+  for (int slot = _a->_p->size_of_test ; slot-- > 0 ; ) {
+    _a->barrier[slot] = 0;
+    _a->out_1_x9[slot] = -239487; _a->out_1_x0[slot] = NULL;
+    _a->x[slot] = 0;
+    _a->y[slot] = ((int *)&(_a->z[slot]));
+    _a->z[slot] = -1;
+  }
+}
+
+/**************************************/
+/* Prefetch (and check) global values */
+/**************************************/
+
+static void check_globals(ctx_t *_a) {
+  /* Per slot, on a random bit each, check that z, y and x (in that order) */
+  /* still hold their reinit() values; then wait on fst_barrier. */
+  st_t *rng = &(_a->seed);
+  for (int slot = _a->_p->size_of_test ; slot-- > 0 ; ) {
+    if (rand_bit(rng)) { if (_a->z[slot] != -1) fatal("ARMARM03, check_globals failed"); }
+    if (rand_bit(rng)) { if (_a->y[slot] != ((int *)&(_a->z[slot]))) fatal("ARMARM03, check_globals failed"); }
+    if (rand_bit(rng)) { if (_a->x[slot] != 0) fatal("ARMARM03, check_globals failed"); }
+  }
+  pb_wait(_a->fst_barrier);
+}
+
+/***************/
+/* Litmus code */
+/***************/
+
+typedef struct {
+  int th_id; /* Index of this test thread; also its index into cpu[] */
+  int *cpu; /* CPU table; the thread is pinned to cpu[th_id] */
+  ctx_t *_a;   /* Context shared by all threads of one test instance */
+} parg_t;
+
+
+
+
+
+static void *P0(void *_vb) { /* Test thread P0: per slot, stores 1 to x[_i] (str), then stores the address of x[_i] into y[_i] (stlr). */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM03");
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* one pass per residue class modulo _stride */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]); /* meet the other thread on slot _i */
+      int trashed_x0; /* scratch register, value discarded */
+asm __volatile__ (
+"\n"
+"#START _litmus_P0\n"
+"#_litmus_P0_0\n\t"
+"mov %w[x0],#1\n"
+"#_litmus_P0_1\n\t"
+"str %w[x0],[%[x1]]\n"
+"#_litmus_P0_2\n\t"
+"stlr %[x1],[%[x3]]\n"
+"#END _litmus_P0\n\t"
+:[x0] "=&r" (trashed_x0)
+:[x1] "r" (&_a->x[_i]),[x3] "r" (&_a->y[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+static void *P1(void *_vb) { /* Test thread P1: per slot, loads the pointer in y[_i] into X0, then loads the int it points to into X9. */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM03");
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int* *out_1_x0 = _a->out_1_x0;
+  int *out_1_x9 = _a->out_1_x9;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* one pass per residue class modulo _stride */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]); /* meet the other thread on slot _i */
+asm __volatile__ (
+"\n"
+"#START _litmus_P1\n"
+"#_litmus_P1_0\n\t"
+"ldr %[x0],[%[x3]]\n"
+"#_litmus_P1_1\n\t"
+"ldr %w[x9],[%[x0]]\n"
+"#END _litmus_P1\n\t"
+:[x9] "=&r" (out_1_x9[_i]),[x0] "=&r" (out_1_x0[_i])
+:[x3] "r" (&_a->y[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+typedef struct {
+  pm_t *p_mutex; /* created once in run() and shared by all instances */
+  pb_t *p_barrier; /* barrier sized for n_exe instances, created in run() */
+  param_t *_p; /* parameters shared by all instances */
+  int z_id; /* instance number, 0..n_exe-1 */
+  int *cpus; /* this instance's part of run()'s aff_cpus array */
+} zyva_t;
+
+#define NT N
+
+static void *zyva(void *_va) { /* One test instance: runs the N test threads max_run times and returns a histogram (hist_t *) of the outcomes. */
+  zyva_t *_a = (zyva_t *) _va;
+  param_t *_b = _a->_p;
+  pb_wait(_a->p_barrier);
+  pthread_t thread[NT];
+  parg_t parg[N];
+  f_t *fun[] = {&P0,&P1};
+  hist_t *hist = alloc_hist();
+  ctx_t ctx;
+  ctx._p = _b;
+
+  init(&ctx);
+  for (int _p = N-1 ; _p >= 0 ; _p--) { /* every thread gets the same context and CPU table, plus its own th_id */
+    parg[_p].th_id = _p; parg[_p]._a = &ctx;
+    parg[_p].cpu = &(_a->cpus[0]);
+  }
+
+  for (int n_run = 0 ; n_run < _b->max_run ; n_run++) {
+    if (_b->aff_mode == aff_random) { /* instance 0 reshuffles the CPU table between two barrier waits */
+      pb_wait(_a->p_barrier);
+      if (_a->z_id == 0) perm_prefix_ints(&ctx.seed,_a->cpus,_b->ncpus_used,_b->ncpus);
+      pb_wait(_a->p_barrier);
+    } else {
+    }
+    if (_b->verbose>1) fprintf(stderr,"Run %i of %i\r", n_run, _b->max_run);
+    reinit(&ctx);
+    if (_b->do_change) perm_funs(&ctx.seed,fun,N);
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      launch(&thread[_p],fun[_p],&parg[_p]);
+    }
+    if (_b->do_change) perm_threads(&ctx.seed,thread,NT);
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      join(&thread[_p]);
+    }
+    /* Log final states: one outcome per slot */
+    for (int _i = _b->size_of_test-1 ; _i >= 0 ; _i--) {
+      int* _out_1_x0_i = ctx.out_1_x0[_i];
+      int _out_1_x9_i = ctx.out_1_x9[_i];
+      outcome_t o;
+      int cond;
+
+      cond = final_ok(final_cond(_out_1_x0_i,_out_1_x9_i,&(ctx.x[_i])));
+      o[out_1_x0_f] = idx_addr(&ctx,_i,_out_1_x0_i); /* pointers are recorded as location indices */
+      o[out_1_x9_f] = _out_1_x9_i;
+      add_outcome(hist,1,o,cond);
+      if (cond) { hist->n_pos++; } else { hist->n_neg++; }
+    }
+  }
+
+  finalize(&ctx);
+  return hist; /* the caller releases it with free_hist() */
+}
+
+#define ENOUGH 10
+
+static int postlude(FILE *out,cmd_t *cmd,hist_t *hist,count_t p_true,count_t p_false,tsc_t total) { /* Prints the report to out; returns 1 when the condition was never observed (p_true == 0). cmd is not used. */
+  fprintf(out,"Test ARMARM03 Forbidden\n");
+  fprintf(out,"Histogram (%i states)\n",finals_outs(hist->outcomes));
+  just_dump_outcomes(out,hist);
+  int cond = p_true == 0; /* the test passes only if no slot matched */
+  fprintf(out,"%s\n",cond?"Ok":"No");
+  fprintf(out,"\nWitnesses\n");
+  fprintf(out,"Positive: %" PCTR ", Negative: %" PCTR "\n",p_false,p_true); /* counts are swapped on purpose: the printed condition is negated (~exists) */
+  fprintf(out,"Condition ~exists (1:X0=x /\\ 1:X9=0) is %svalidated\n",cond ? "" : "NOT ");
+  fprintf(out,"Hash=5576fb6a46ae70aa15a1087a51748a39\n");
+  fprintf(out,"Com=Rf Fr\n");
+  fprintf(out,"Orig=PodWWPL RfeLP DpAddrdR Fre\n");
+  count_t cond_true = p_true;
+  count_t cond_false = p_false;
+  fprintf(out,"Observation ARMARM03 %s %" PCTR " %" PCTR "\n",!cond_true ? "Never" : !cond_false ? "Always" : "Sometimes",cond_true,cond_false);
+  if (p_true > 0) { /* intentionally empty in this generated file */
+  }
+  fprintf(out,"Time ARMARM03 %.2f\n",total / 1000000.0);
+  fflush(out);
+  return cond;
+}
+
+static int run(cmd_t *cmd,cpus_t *def_all_cpus,FILE *out) {
+  /* Runs n_exe concurrent instances of the test (zyva), merges their histograms and prints the report to out; returns postlude()'s result. def_all_cpus is not used here. */
+  tsc_t start = timeofday(); param_t prm ;
+/* Set some parameters */
+  prm.verbose = cmd->verbose;
+  prm.size_of_test = cmd->size_of_test;
+  prm.max_run = cmd->max_run;
+  prm.stride = cmd->stride;
+  prm.do_change = 1;
+  if (cmd->fix) prm.do_change = 0;
+/* Compute the number of concurrent test instances */
+  int n_avail = cmd->avail > 0 ? cmd->avail : cmd->aff_cpus->sz;
+  if (n_avail >  cmd->aff_cpus->sz) log_error("Warning: avail=%i, available=%i\n",n_avail, cmd->aff_cpus->sz);
+  int n_exe;
+  if (cmd->n_exe > 0) {
+    n_exe = cmd->n_exe;
+  } else {
+    n_exe = n_avail < N ? 1 : n_avail / N; /* always at least one instance */
+  }
+/* Set affinity parameters */
+  cpus_t *all_cpus = cmd->aff_cpus;
+  int aff_cpus_sz = cmd->aff_mode == aff_random ? max(all_cpus->sz,N*n_exe) : N*n_exe;
+  int aff_cpus[aff_cpus_sz];
+  prm.aff_mode = cmd->aff_mode;
+  prm.ncpus = aff_cpus_sz;
+  prm.ncpus_used = N*n_exe;
+/* Show parameters to user */
+  if (prm.verbose) {
+    log_error( "ARMARM03: n=%i, r=%i, s=%i",n_exe,prm.max_run,prm.size_of_test);
+    log_error(", st=%i",prm.stride);
+    if (cmd->aff_mode == aff_incr) {
+      log_error( ", i=%i",cmd->aff_incr);
+    } else if (cmd->aff_mode == aff_random) {
+      log_error(", +ra");
+    } else if (cmd->aff_mode == aff_custom) {
+      log_error(", +ca");
+    } else if (cmd->aff_mode == aff_scan) {
+      log_error(", +sa");
+    }
+    log_error(", p='");
+    cpus_dump(stderr,cmd->aff_cpus);
+    log_error("'");
+    log_error("\n");
+  }
+  if (cmd->aff_mode == aff_random) { /* fill the table cyclically from the CPU set */
+    for (int k = 0 ; k < aff_cpus_sz ; k++) {
+      aff_cpus[k] = all_cpus->cpu[k % all_cpus->sz];
+    }
+  }
+  hist_t *hist = NULL;
+  int n_th = n_exe-1; /* the last instance runs on the calling thread */
+  pthread_t th[n_th > 0 ? n_th : 1]; /* fix: never a zero-length VLA (n_th is 0 when a single instance runs) */
+  zyva_t zarg[n_exe];
+  pm_t *p_mutex = pm_create();
+  pb_t *p_barrier = pb_create(n_exe);
+  int next_cpu = 0;
+  int delta = cmd->aff_incr;
+  if (delta <= 0) { /* no positive increment: every CPU entry becomes -1 (presumably "no pinning" - confirm in affinity.c) */
+    for (int k=0 ; k < all_cpus->sz ; k++) all_cpus->cpu[k] = -1;
+    delta = 1;
+  } else {
+    delta %= all_cpus->sz;
+  }
+  int start_scan=0, max_start=gcd(delta,all_cpus->sz);
+  int *aff_p = aff_cpus;
+  for (int k=0 ; k < n_exe ; k++) {
+    zyva_t *p = &zarg[k];
+    p->_p = &prm;
+    p->p_mutex = p_mutex; p->p_barrier = p_barrier;
+    p->z_id = k;
+    p->cpus = aff_p; /* instance k owns N consecutive entries of aff_cpus */
+    if (cmd->aff_mode != aff_incr) {
+      aff_p += N;
+    } else {
+      for (int i=0 ; i < N ; i++) { /* walk the CPU set in steps of delta, restarting one further on each wrap */
+        *aff_p = all_cpus->cpu[next_cpu]; aff_p++;
+        next_cpu += delta; next_cpu %= all_cpus->sz;
+        if (next_cpu == start_scan) {
+          start_scan++ ; start_scan %= max_start;
+          next_cpu = start_scan;
+        }
+      }
+    }
+    if (k < n_th) {
+      launch(&th[k],zyva,p);
+    } else {
+      hist = (hist_t *)zyva(p);
+    }
+  }
+
+  count_t n_outs = prm.size_of_test; n_outs *= prm.max_run; /* outcomes expected from one instance */
+  for (int k=0 ; k < n_th ; k++) {
+    hist_t *hk = (hist_t *)join(&th[k]);
+    if (sum_hist(hk) != n_outs || hk->n_pos + hk->n_neg != n_outs) {
+      fatal("ARMARM03, sum_hist");
+    }
+    merge_hists(hist,hk);
+    free_hist(hk);
+  }
+  cpus_free(all_cpus); /* frees cmd->aff_cpus; main() relies on this */
+  tsc_t total = timeofday() - start;
+  pm_free(p_mutex);
+  pb_free(p_barrier);
+
+  n_outs *= n_exe ; /* outcomes expected from all instances together */
+  if (sum_hist(hist) != n_outs || hist->n_pos + hist->n_neg != n_outs) {
+    fatal("ARMARM03, sum_hist") ;
+  }
+  count_t p_true = hist->n_pos, p_false = hist->n_neg;
+  int cond = postlude(out,cmd,hist,p_true,p_false,total);
+  free_hist(hist);
+  return cond;
+}
+
+
+int main(int argc, char **argv) { /* Entry point: build defaults, parse the command line, run the test. */
+  cpus_t *def_all_cpus = read_force_affinity(AVAIL,0);
+  if (def_all_cpus->sz < N) { /* CPU set smaller than N: do not run, still report success */
+    cpus_free(def_all_cpus);
+    return EXIT_SUCCESS;
+  }
+  cmd_t def = { 0, NUMBER_OF_RUN, SIZE_OF_TEST, STRIDE, AVAIL, 0, 0, aff_incr, 0, 0, AFF_INCR, def_all_cpus, NULL, -1, MAX_LOOP, NULL, NULL, -1, -1, -1, 0, 0}; /* positional: field order comes from the cmd_t declaration, not shown in this file */
+  cmd_t cmd = def;
+  parse_cmd(argc,argv,&def,&cmd);
+  int cond = run(&cmd,def_all_cpus,stdout);
+  if (def_all_cpus != cmd.aff_cpus) cpus_free(def_all_cpus); /* run() already freed cmd.aff_cpus */
+  return cond ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/tests/tcg/mttcg/aarch64/ARMARM04+BIS.c b/tests/tcg/mttcg/aarch64/ARMARM04+BIS.c
new file mode 100644
index 0000000..41a2bc4
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/ARMARM04+BIS.c
@@ -0,0 +1,556 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* This C source is a product of litmus7 and includes source that is        */
+/* governed by the CeCILL-B license.                                        */
+/****************************************************************************/
+/* Parameters */
+#define SIZE_OF_TEST 100000
+#define NUMBER_OF_RUN 10
+#define AVAIL 0
+#define STRIDE 1
+#define MAX_LOOP 0
+#define N 3
+#define AFF_INCR (0)
+/* Includes */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <time.h>
+#include <limits.h>
+#include "utils.h"
+#include "outs.h"
+#include "affinity.h"
+
+/* params: run-time settings filled in by run() and shared by all instances */
+typedef struct {
+  int verbose;               /* verbosity level copied from the command line */
+  int size_of_test,max_run;  /* iterations per run, and number of runs */
+  int stride;                /* step used when walking the iteration arrays */
+  aff_mode_t aff_mode;       /* affinity mode copied from the command line */
+  int ncpus, ncpus_used;     /* length of the cpu array, and N * number of instances */
+  int do_change;             /* non-zero: permute thread functions and handles at each run */
+} param_t;
+
+
+/* Full memory barrier: "dsb sy", also declared as a compiler memory clobber */
+static inline void mbar(void) {
+  __asm__ __volatile__ ("dsb sy" : : : "memory");
+}
+
+/*
+ * Per-iteration rendez-vous on the flag *b.
+ * The thread whose id equals k modulo N raises the flag; every other
+ * thread spins until the flag becomes non-zero.
+ */
+static inline void barrier_wait(unsigned int id, unsigned int k, int volatile *b) {
+  if (id != (k % N)) {
+    while (*b == 0) {
+      /* busy-wait */
+    }
+    return;
+  }
+  *b = 1 ;
+}
+
+/**********************/
+/* Context definition */
+/**********************/
+
+
+/* Per-instance context: test memory, result arrays and synchronisation data */
+typedef struct {
+/* Shared variables: arrays of size_of_test cells, one cell per iteration */
+  int *y;
+  int *x;
+/* Final content of observed registers, one entry per iteration */
+  int *out_1_x0;
+  int *out_2_x0;
+  int *out_2_x3;
+/* Check data: barrier waited on at the end of check_globals() */
+  pb_t *fst_barrier;
+/* Barrier for litmus loop: one flag per iteration, used by barrier_wait() */
+  int volatile *barrier;
+/* Instance seed, set from rand() in init() and passed to rand_bit()/perm_* */
+  st_t seed;
+/* Parameters */
+  param_t *_p;
+} ctx_t;
+
+/*
+ * Evaluate the final condition on one set of observed registers.
+ * Returns 1 when 1:X0 == 1, 2:X0 == 1 and 2:X3 == 0 all hold, 0 otherwise.
+ */
+static inline int final_cond(int _out_1_x0,int _out_2_x0,int _out_2_x3) {
+  if (_out_1_x0 != 1) {
+    return 0;
+  }
+  if (_out_2_x0 != 1) {
+    return 0;
+  }
+  return (_out_2_x3 == 0) ? 1 : 0;
+}
+
+/* Map the value of final_cond() to the "show" flag: passed through unchanged */
+static inline int final_ok(int cond) {
+  const int verdict = cond;
+  return verdict;
+}
+
+/**********************/
+/* Outcome collection */
+/**********************/
+#define NOUTS 3  /* number of observed registers stored in one outcome */
+typedef intmax_t outcome_t[NOUTS];  /* one observed final state */
+
+/* Position of each observed register inside an outcome_t */
+static const int out_1_x0_f = 0 ;
+static const int out_2_x0_f = 1 ;
+static const int out_2_x3_f = 2 ;
+
+
+/* Histogram of the outcomes recorded by one instance (or merged from several) */
+typedef struct hist_t {
+  outs_t *outcomes ;     /* recorded outcomes, managed through the *_outs helpers */
+  count_t n_pos,n_neg ;  /* outcomes for which final_cond held / did not hold */
+} hist_t ;
+
+/* Allocate a histogram holding no outcome, with both counters at zero */
+static hist_t *alloc_hist(void) {
+  hist_t *fresh = malloc_check(sizeof(*fresh)) ;
+
+  fresh->n_neg = 0 ;
+  fresh->n_pos = 0 ;
+  fresh->outcomes = NULL ;
+  return fresh ;
+}
+
+/* Release a histogram: first its outcome collection, then the struct itself */
+static void free_hist(hist_t *h) {
+  outs_t *recorded = h->outcomes ;
+
+  free_outs(recorded) ;
+  free(h) ;
+}
+
+/*
+ * Record outcome o in h with count v; show is forwarded to
+ * add_outcome_outs() together with the outcome width NOUTS.
+ */
+static void add_outcome(hist_t *h, count_t v, outcome_t o, int show) {
+  outs_t *updated = add_outcome_outs(h->outcomes,o,NOUTS,v,show) ;
+
+  h->outcomes = updated ;
+}
+
+/* Fold h1 into h0: merge the outcome collections and add up both counters */
+static void merge_hists(hist_t *h0, hist_t *h1) {
+  outs_t *merged = merge_outs(h0->outcomes,h1->outcomes,NOUTS) ;
+
+  h0->outcomes = merged ;
+  h0->n_neg += h1->n_neg ;
+  h0->n_pos += h1->n_pos ;
+}
+
+/* Return sum_outs() applied to the outcome collection of h */
+static count_t sum_hist(hist_t *h) {
+  count_t total = sum_outs(h->outcomes) ;
+
+  return total ;
+}
+
+
+/*
+ * Print one histogram line on fhist: the count c, a marker ('*' when show
+ * is non-zero, ':' otherwise) and the three observed register values of o.
+ */
+static void do_dump_outcome(FILE *fhist, intmax_t *o, count_t c, int show) {
+  const int marker = show ? '*' : ':';
+  const int v_1_x0 = (int)o[out_1_x0_f];
+  const int v_2_x0 = (int)o[out_2_x0_f];
+  const int v_2_x3 = (int)o[out_2_x3_f];
+
+  fprintf(fhist,"%-6"PCTR"%c>1:X0=%i; 2:X0=%i; 2:X3=%i;\n",c,marker,v_1_x0,v_2_x0,v_2_x3);
+}
+
+/*
+ * Dump every outcome of h on fhist through dump_outs(), which is given
+ * do_dump_outcome() as printer and a local outcome_t as work buffer.
+ */
+static void just_dump_outcomes(FILE *fhist, hist_t *h) {
+  outcome_t scratch ;
+  outs_t *recorded = h->outcomes ;
+
+  dump_outs(fhist,do_dump_outcome,recorded,scratch,NOUTS) ;
+}
+
+/*******************************************************/
+/* Context allocation, freeing and reinitialization    */
+/*******************************************************/
+
+/*
+ * Set up a context whose _p field is already filled in: draw the instance
+ * seed from rand(), allocate every per-iteration array with size_of_test
+ * entries and create the N-party barrier used by check_globals().
+ * The arrays are left uninitialised; reinit() fills them before each run.
+ */
+static void init(ctx_t *_a) {
+  const size_t n_cells = (size_t)_a->_p->size_of_test;
+
+  _a->seed = rand();
+  _a->out_1_x0 = malloc_check(n_cells * sizeof *_a->out_1_x0);
+  _a->out_2_x0 = malloc_check(n_cells * sizeof *_a->out_2_x0);
+  _a->out_2_x3 = malloc_check(n_cells * sizeof *_a->out_2_x3);
+  _a->y = malloc_check(n_cells * sizeof *_a->y);
+  _a->x = malloc_check(n_cells * sizeof *_a->x);
+  _a->fst_barrier = pb_create(N);
+  _a->barrier = malloc_check(n_cells * sizeof *_a->barrier);
+}
+
+/* Release everything init() allocated for the context (not the ctx_t itself) */
+static void finalize(ctx_t *_a) {
+  free(_a->y);
+  free(_a->x);
+  free(_a->out_1_x0);
+  free(_a->out_2_x0);
+  free(_a->out_2_x3);
+  pb_free(_a->fst_barrier);
+  /* the cast drops the volatile qualifier of the flag array */
+  free((void *)_a->barrier);
+}
+
+/*
+ * Reset the context before a run, from the last cell down to the first:
+ * shared variables and barrier flags to 0, observed registers to -239487.
+ */
+static void reinit(ctx_t *_a) {
+  const int unset = -239487;
+  int *x = _a->x;
+  int *y = _a->y;
+  int *r_1_x0 = _a->out_1_x0;
+  int *r_2_x0 = _a->out_2_x0;
+  int *r_2_x3 = _a->out_2_x3;
+  int volatile *flags = _a->barrier;
+  int _i = _a->_p->size_of_test;
+
+  while (--_i >= 0) {
+    y[_i] = 0;
+    x[_i] = 0;
+    r_1_x0[_i] = unset;
+    r_2_x0[_i] = unset;
+    r_2_x3[_i] = unset;
+    flags[_i] = 0;
+  }
+}
+
+/**************************************/
+/* Prefetch (and check) global values */
+/**************************************/
+
+/*
+ * Walk the y and x arrays from the last cell down.  For each cell a random
+ * bit is drawn; when it is set the cell is read and must hold 0, otherwise
+ * fatal() is called.  Ends by waiting on the context's fst_barrier.
+ * NOTE(review): every test thread calls this on the same context, so the
+ * seed is presumably updated concurrently - confirm rand_bit() tolerates it.
+ */
+static void check_globals(ctx_t *_a) {
+  st_t *seed = &(_a->seed);
+  int *x = _a->x;
+  int *y = _a->y;
+  int _i = _a->_p->size_of_test;
+
+  while (--_i >= 0) {
+    if (rand_bit(seed)) {
+      if (y[_i] != 0) fatal("ARMARM04+BIS, check_globals failed");
+    }
+    if (rand_bit(seed)) {
+      if (x[_i] != 0) fatal("ARMARM04+BIS, check_globals failed");
+    }
+  }
+  pb_wait(_a->fst_barrier);
+}
+
+/***************/
+/* Litmus code */
+/***************/
+
+/* Argument handed to each test thread (P0, P1, P2) */
+typedef struct {
+  int th_id; /* I am running on this thread */
+  int *cpu; /* cpu array; the thread uses the entry at index th_id */
+  ctx_t *_a;   /* In this context */
+} parg_t;
+
+
+
+
+
+/*
+ * Test thread P0.  _vb points to this thread's parg_t.
+ * For every iteration _i, after the per-iteration rendez-vous, stores 1
+ * into x[_i] with a store-release (stlr).  Always returns NULL.
+ */
+static void *P0(void *_vb) {
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];  /* cpu entry for this thread id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM04+BIS");
+  check_globals(_a);  /* ends with a wait on the context's fst_barrier */
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  /* _stride passes, each stepping down by _stride: every index is visited once */
+  for (int _j = _stride ; _j > 0 ; _j--) {
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]);
+      int trashed_x0;  /* scratch register holding the constant 1 */
+asm __volatile__ (
+"\n"
+"#START _litmus_P0\n"
+"#_litmus_P0_0\n\t"
+"mov %w[x0],#1\n"
+"#_litmus_P0_1\n\t"
+"stlr %w[x0],[%[x1]]\n"
+"#END _litmus_P0\n\t"
+:[x0] "=&r" (trashed_x0)
+:[x1] "r" (&_a->x[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+/*
+ * Test thread P1.  _vb points to this thread's parg_t.
+ * For every iteration _i, after the per-iteration rendez-vous, reads x[_i]
+ * with a load-acquire (ldar) into X0, which is recorded in out_1_x0[_i];
+ * only when the value read is 1 does it store 1 into y[_i] with a plain
+ * str.  Always returns NULL.
+ */
+static void *P1(void *_vb) {
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];  /* cpu entry for this thread id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM04+BIS");
+  check_globals(_a);  /* ends with a wait on the context's fst_barrier */
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_1_x0 = _a->out_1_x0;
+  /* _stride passes, each stepping down by _stride: every index is visited once */
+  for (int _j = _stride ; _j > 0 ; _j--) {
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]);
+      int trashed_x2;  /* scratch register holding the constant 1 */
+      /* the branch to local label 0 skips the store when X0 != 1 */
+asm __volatile__ (
+"\n"
+"#START _litmus_P1\n"
+"#_litmus_P1_0\n\t"
+"ldar %w[x0],[%[x1]]\n"
+"#_litmus_P1_1\n\t"
+"cmp %w[x0],#1\n"
+"#_litmus_P1_2\n\t"
+"b.ne 0f\n"
+"#_litmus_P1_3\n\t"
+"mov %w[x2],#1\n"
+"#_litmus_P1_4\n\t"
+"str %w[x2],[%[x3]]\n"
+"#_litmus_P1_5\n"
+"0:\n"
+"#END _litmus_P1\n\t"
+:[x0] "=&r" (out_1_x0[_i]),[x2] "=&r" (trashed_x2)
+:[x1] "r" (&_a->x[_i]),[x3] "r" (&_a->y[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+/*
+ * Test thread P2.  _vb points to this thread's parg_t.
+ * For every iteration _i, after the per-iteration rendez-vous, reads y[_i]
+ * with a plain ldr into X0 (recorded in out_2_x0[_i]).  Only when that
+ * value is 1 does it compute X2 = X0 eor X0, add it (sign-extended) to the
+ * address of x[_i] and load through the resulting address into X3
+ * (recorded in out_2_x3[_i]).  X3 is preset to -1 through the matching
+ * input operand, so -1 is recorded when the second load is skipped.
+ * Always returns NULL.
+ */
+static void *P2(void *_vb) {
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];  /* cpu entry for this thread id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM04+BIS");
+  check_globals(_a);  /* ends with a wait on the context's fst_barrier */
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_2_x0 = _a->out_2_x0;
+  int *out_2_x3 = _a->out_2_x3;
+  /* _stride passes, each stepping down by _stride: every index is visited once */
+  for (int _j = _stride ; _j > 0 ; _j--) {
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]);
+      int trashed_x2;    /* scratch: receives X0 eor X0 */
+      void* trashed_x5;  /* scratch: address computed from x[_i] and X2 */
+asm __volatile__ (
+"\n"
+"#START _litmus_P2\n"
+"#_litmus_P2_0\n\t"
+"ldr %w[x0],[%[x1]]\n"
+"#_litmus_P2_1\n\t"
+"cmp %w[x0],#1\n"
+"#_litmus_P2_2\n\t"
+"b.ne 0f\n"
+"#_litmus_P2_3\n\t"
+"eor %w[x2],%w[x0],%w[x0]\n"
+"#_litmus_P2_4\n\t"
+"add %[x5],%[x4],%w[x2],sxtw\n"
+"#_litmus_P2_5\n\t"
+"ldr %w[x3],[%[x5]]\n"
+"#_litmus_P2_6\n"
+"0:\n"
+"#END _litmus_P2\n\t"
+:[x3] "=&r" (out_2_x3[_i]),[x0] "=&r" (out_2_x0[_i]),[x5] "=&r" (trashed_x5),[x2] "=&r" (trashed_x2)
+:[x1] "r" (&_a->y[_i]),"[x3]" (-1),[x4] "r" (&_a->x[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+/* Argument of zyva(): what one test instance needs, filled in by run() */
+typedef struct {
+  pm_t *p_mutex;    /* mutex created by run(), the same for all instances */
+  pb_t *p_barrier;  /* barrier created by run() for the n_exe instances */
+  param_t *_p;      /* run-time parameters */
+  int z_id;         /* instance number, 0 .. n_exe-1 */
+  int *cpus;        /* this instance's slice of the cpu array */
+} zyva_t;
+
+#define NT N  /* threads started by one instance */
+
+/*
+ * Body of one test instance.  _va points to a zyva_t.
+ * Allocates a private context, then for each of max_run runs resets the
+ * context, starts the N test threads, joins them and records the
+ * size_of_test outcomes in a fresh histogram.
+ * Returns that histogram (a hist_t *); the caller releases it.
+ */
+static void *zyva(void *_va) {
+  zyva_t *_a = (zyva_t *) _va;
+  param_t *_b = _a->_p;
+  pb_wait(_a->p_barrier);  /* wait until all instances have reached this point */
+  pthread_t thread[NT];
+  parg_t parg[N];
+  f_t *fun[] = {&P0,&P1,&P2};
+  hist_t *hist = alloc_hist();
+  ctx_t ctx;
+  ctx._p = _b;
+
+  init(&ctx);
+  /* All threads get the same cpu array; each one indexes it by its th_id */
+  for (int _p = N-1 ; _p >= 0 ; _p--) {
+    parg[_p].th_id = _p; parg[_p]._a = &ctx;
+    parg[_p].cpu = &(_a->cpus[0]);
+  }
+
+  for (int n_run = 0 ; n_run < _b->max_run ; n_run++) {
+    if (_b->aff_mode == aff_random) {
+      /* Instance 0 permutes the cpu array while the others wait between
+         the two barriers */
+      pb_wait(_a->p_barrier);
+      if (_a->z_id == 0) perm_prefix_ints(&ctx.seed,_a->cpus,_b->ncpus_used,_b->ncpus);
+      pb_wait(_a->p_barrier);
+    } else {
+    }
+    if (_b->verbose>1) fprintf(stderr,"Run %i of %i\r", n_run, _b->max_run);
+    reinit(&ctx);
+    /* do_change: permute the thread functions before launching them ... */
+    if (_b->do_change) perm_funs(&ctx.seed,fun,N);
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      launch(&thread[_p],fun[_p],&parg[_p]);
+    }
+    /* ... and the thread handles before joining them */
+    if (_b->do_change) perm_threads(&ctx.seed,thread,NT);
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      join(&thread[_p]);
+    }
+    /* Log final states */
+    for (int _i = _b->size_of_test-1 ; _i >= 0 ; _i--) {
+      int _out_1_x0_i = ctx.out_1_x0[_i];
+      int _out_2_x0_i = ctx.out_2_x0[_i];
+      int _out_2_x3_i = ctx.out_2_x3[_i];
+      outcome_t o;
+      int cond;
+
+      cond = final_ok(final_cond(_out_1_x0_i,_out_2_x0_i,_out_2_x3_i));
+      o[out_1_x0_f] = _out_1_x0_i;
+      o[out_2_x0_f] = _out_2_x0_i;
+      o[out_2_x3_f] = _out_2_x3_i;
+      add_outcome(hist,1,o,cond);
+      if (cond) { hist->n_pos++; } else { hist->n_neg++; }
+    }
+  }
+
+  finalize(&ctx);
+  return hist;
+}
+
+#define ENOUGH 10
+
+/*
+ * Print the final report for the merged histogram on out and return the
+ * verdict: 1 when no outcome satisfied the exists clause (p_true == 0),
+ * 0 otherwise.  cmd is not used.  The "Positive" figure printed is p_false
+ * and the "Negative" one is p_true, the reported condition being the
+ * negated (~exists) form.  total is printed divided by 1000000.0.
+ */
+static int postlude(FILE *out,cmd_t *cmd,hist_t *hist,count_t p_true,count_t p_false,tsc_t total) {
+  const int cond = (p_true == 0);
+  const char *how_often;
+
+  (void)cmd;
+  if (p_true == 0) {
+    how_often = "Never";
+  } else if (p_false == 0) {
+    how_often = "Always";
+  } else {
+    how_often = "Sometimes";
+  }
+
+  fprintf(out,"Test ARMARM04+BIS Forbidden\n");
+  fprintf(out,"Histogram (%i states)\n",finals_outs(hist->outcomes));
+  just_dump_outcomes(out,hist);
+  fprintf(out,"%s\n",cond?"Ok":"No");
+  fprintf(out,"\nWitnesses\n");
+  fprintf(out,"Positive: %" PCTR ", Negative: %" PCTR "\n",p_false,p_true);
+  fprintf(out,"Condition ~exists (1:X0=1 /\\ 2:X0=1 /\\ 2:X3=0) is %svalidated\n",cond ? "" : "NOT ");
+  fprintf(out,"Hash=cc1a451a55f22a8237344520956968ae\n");
+  fprintf(out,"Com=Rf Rf Fr\n");
+  fprintf(out,"Orig=RfeLA PodRWAP Rfe DpAddrdRPA FreAL\n");
+  fprintf(out,"Observation ARMARM04+BIS %s %" PCTR " %" PCTR "\n",how_often,p_true,p_false);
+  fprintf(out,"Time ARMARM04+BIS %.2f\n",total / 1000000.0);
+  fflush(out);
+  return cond;
+}
+
+/*
+ * Run the whole test: start n_exe concurrent instances of zyva(), merge
+ * their histograms, check the totals and print the report via postlude().
+ *
+ * cmd          - effective settings; cmd->aff_cpus is released here with
+ *                cpus_free()
+ * def_all_cpus - not used by this function
+ * out          - stream the final report is written to
+ *
+ * Returns the value returned by postlude().  Calls fatal() when the number
+ * of recorded outcomes does not match size_of_test * max_run per instance.
+ */
+static int run(cmd_t *cmd,cpus_t *def_all_cpus,FILE *out) {
+  tsc_t start = timeofday();
+  param_t prm ;
+/* Set some parameters */
+  prm.verbose = cmd->verbose;
+  prm.size_of_test = cmd->size_of_test;
+  prm.max_run = cmd->max_run;
+  prm.stride = cmd->stride;
+  prm.do_change = 1;
+  if (cmd->fix) prm.do_change = 0;
+/* Computes number of test concurrent instances */
+  int n_avail = cmd->avail > 0 ? cmd->avail : cmd->aff_cpus->sz;
+  if (n_avail >  cmd->aff_cpus->sz) log_error("Warning: avail=%i, available=%i\n",n_avail, cmd->aff_cpus->sz);
+  int n_exe;
+  if (cmd->n_exe > 0) {
+    n_exe = cmd->n_exe;
+  } else {
+    n_exe = n_avail < N ? 1 : n_avail / N;
+  }
+/* Set affinity parameters */
+  cpus_t *all_cpus = cmd->aff_cpus;
+  int aff_cpus_sz = cmd->aff_mode == aff_random ? max(all_cpus->sz,N*n_exe) : N*n_exe;
+  /* NOTE(review): aff_cpus is filled only in the aff_random and aff_incr
+     modes below; confirm no other mode reaches the threads' cpu lookup. */
+  int aff_cpus[aff_cpus_sz];
+  prm.aff_mode = cmd->aff_mode;
+  prm.ncpus = aff_cpus_sz;
+  prm.ncpus_used = N*n_exe;
+/* Show parameters to user */
+  if (prm.verbose) {
+    log_error( "ARMARM04+BIS: n=%i, r=%i, s=%i",n_exe,prm.max_run,prm.size_of_test);
+    log_error(", st=%i",prm.stride);
+    if (cmd->aff_mode == aff_incr) {
+      log_error( ", i=%i",cmd->aff_incr);
+    } else if (cmd->aff_mode == aff_random) {
+      log_error(", +ra");
+    } else if (cmd->aff_mode == aff_custom) {
+      log_error(", +ca");
+    } else if (cmd->aff_mode == aff_scan) {
+      log_error(", +sa");
+    }
+    log_error(", p='");
+    cpus_dump(stderr,cmd->aff_cpus);
+    log_error("'");
+    log_error("\n");
+  }
+  if (cmd->aff_mode == aff_random) {
+    /* Fill the cpu array by cycling through the available cpus */
+    for (int k = 0 ; k < aff_cpus_sz ; k++) {
+      aff_cpus[k] = all_cpus->cpu[k % all_cpus->sz];
+    }
+  }
+  hist_t *hist = NULL;
+  int n_th = n_exe-1;
+  /* n_exe may be 1, and a variable length array must have a size greater
+     than zero: always reserve at least one slot. */
+  pthread_t th[n_th > 0 ? n_th : 1];
+  zyva_t zarg[n_exe];
+  pm_t *p_mutex = pm_create();
+  pb_t *p_barrier = pb_create(n_exe);
+  int next_cpu = 0;
+  int delta = cmd->aff_incr;
+  if (delta <= 0) {
+    /* Non-positive increment: every cpu entry is overwritten with -1
+       (presumably "do not pin" - confirm in force_one_affinity) */
+    for (int k=0 ; k < all_cpus->sz ; k++) all_cpus->cpu[k] = -1;
+    delta = 1;
+  } else {
+    delta %= all_cpus->sz;
+  }
+  int start_scan=0, max_start=gcd(delta,all_cpus->sz);
+  int *aff_p = aff_cpus;
+  for (int k=0 ; k < n_exe ; k++) {
+    zyva_t *p = &zarg[k];
+    p->_p = &prm;
+    p->p_mutex = p_mutex; p->p_barrier = p_barrier;
+    p->z_id = k;
+    p->cpus = aff_p;
+    if (cmd->aff_mode != aff_incr) {
+      aff_p += N;
+    } else {
+      /* Hand out N cpus, walking all_cpus by steps of delta; when the walk
+         returns to its starting index, move the start one entry further */
+      for (int i=0 ; i < N ; i++) {
+        *aff_p = all_cpus->cpu[next_cpu]; aff_p++;
+        next_cpu += delta; next_cpu %= all_cpus->sz;
+        if (next_cpu == start_scan) {
+          start_scan++ ; start_scan %= max_start;
+          next_cpu = start_scan;
+        }
+      }
+    }
+    /* The last instance runs in the calling thread, the others in their own */
+    if (k < n_th) {
+      launch(&th[k],zyva,p);
+    } else {
+      hist = (hist_t *)zyva(p);
+    }
+  }
+
+  /* Each instance must have recorded size_of_test * max_run outcomes */
+  count_t n_outs = prm.size_of_test; n_outs *= prm.max_run;
+  for (int k=0 ; k < n_th ; k++) {
+    hist_t *hk = (hist_t *)join(&th[k]);
+    if (sum_hist(hk) != n_outs || hk->n_pos + hk->n_neg != n_outs) {
+      fatal("ARMARM04+BIS, sum_hist");
+    }
+    merge_hists(hist,hk);
+    free_hist(hk);
+  }
+  cpus_free(all_cpus);
+  tsc_t total = timeofday() - start;
+  pm_free(p_mutex);
+  pb_free(p_barrier);
+
+  /* Same check on the merged histogram */
+  n_outs *= n_exe ;
+  if (sum_hist(hist) != n_outs || hist->n_pos + hist->n_neg != n_outs) {
+    fatal("ARMARM04+BIS, sum_hist") ;
+  }
+  count_t p_true = hist->n_pos, p_false = hist->n_neg;
+  int cond = postlude(out,cmd,hist,p_true,p_false,total);
+  free_hist(hist);
+  return cond;
+}
+
+
+/*
+ * Entry point of the test program.
+ * Builds the default settings, lets parse_cmd() override them from the
+ * command line, runs the test and turns the value returned by run() into
+ * the process exit status.  When read_force_affinity() reports fewer than
+ * N cpus, nothing is run and the program exits with EXIT_SUCCESS.
+ */
+int main(int argc, char **argv) {
+  cpus_t *dflt_cpus = read_force_affinity(AVAIL,0);
+
+  if (dflt_cpus->sz < N) {
+    /* Too few cpus for the N test threads: skip the test */
+    cpus_free(dflt_cpus);
+    return EXIT_SUCCESS;
+  }
+
+  cmd_t def = { 0, NUMBER_OF_RUN, SIZE_OF_TEST, STRIDE, AVAIL, 0, 0, aff_incr, 0, 0, AFF_INCR, dflt_cpus, NULL, -1, MAX_LOOP, NULL, NULL, -1, -1, -1, 0, 0};
+  cmd_t cmd = def;
+  parse_cmd(argc,argv,&def,&cmd);
+
+  int ok = run(&cmd,dflt_cpus,stdout);
+
+  /* run() released cmd.aff_cpus; free the default set only if it differs */
+  if (cmd.aff_cpus != dflt_cpus) {
+    cpus_free(dflt_cpus);
+  }
+  if (ok) {
+    return EXIT_SUCCESS;
+  }
+  return EXIT_FAILURE;
+}
diff --git a/tests/tcg/mttcg/aarch64/ARMARM04+TER.c b/tests/tcg/mttcg/aarch64/ARMARM04+TER.c
new file mode 100644
index 0000000..46dcc3a
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/ARMARM04+TER.c
@@ -0,0 +1,538 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* This C source is a product of litmus7 and includes source that is        */
+/* governed by the CeCILL-B license.                                        */
+/****************************************************************************/
+/* Parameters */
+#define SIZE_OF_TEST 100000
+#define NUMBER_OF_RUN 10
+#define AVAIL 0
+#define STRIDE 1
+#define MAX_LOOP 0
+#define N 3
+#define AFF_INCR (0)
+/* Includes */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <time.h>
+#include <limits.h>
+#include "utils.h"
+#include "outs.h"
+#include "affinity.h"
+
+/* params: run-time settings filled in by run() and shared by all instances */
+typedef struct {
+  int verbose;               /* verbosity level copied from the command line */
+  int size_of_test,max_run;  /* iterations per run, and number of runs */
+  int stride;                /* step used when walking the iteration arrays */
+  aff_mode_t aff_mode;       /* affinity mode copied from the command line */
+  int ncpus, ncpus_used;     /* length of the cpu array, and N * number of instances */
+  int do_change;             /* non-zero: permute thread functions and handles at each run */
+} param_t;
+
+
+/* Full memory barrier: "dsb sy", also declared as a compiler memory clobber */
+static inline void mbar(void) {
+  __asm__ __volatile__ ("dsb sy" : : : "memory");
+}
+
+/*
+ * Per-iteration rendez-vous on the flag *b.
+ * The thread whose id equals k modulo N raises the flag; every other
+ * thread spins until the flag becomes non-zero.
+ */
+static inline void barrier_wait(unsigned int id, unsigned int k, int volatile *b) {
+  if (id != (k % N)) {
+    while (*b == 0) {
+      /* busy-wait */
+    }
+    return;
+  }
+  *b = 1 ;
+}
+
+/**********************/
+/* Context definition */
+/**********************/
+
+
+/* Per-instance context: test memory, result arrays and synchronisation data */
+typedef struct {
+/* Shared variables: arrays of size_of_test cells, one cell per iteration */
+  int *y;
+  int *x;
+/* Final content of observed registers, one entry per iteration */
+  int *out_1_x0;
+  int *out_2_x0;
+  int *out_2_x2;
+/* Check data: barrier waited on at the end of check_globals() */
+  pb_t *fst_barrier;
+/* Barrier for litmus loop: one flag per iteration, used by barrier_wait() */
+  int volatile *barrier;
+/* Instance seed, set from rand() in init() and passed to rand_bit()/perm_* */
+  st_t seed;
+/* Parameters */
+  param_t *_p;
+} ctx_t;
+
+/*
+ * Evaluate the final condition on one set of observed registers.
+ * Returns 1 when 1:X0 == 1, 2:X0 == 1 and 2:X2 == 0 all hold, 0 otherwise.
+ */
+static inline int final_cond(int _out_1_x0,int _out_2_x0,int _out_2_x2) {
+  if (_out_1_x0 != 1) {
+    return 0;
+  }
+  if (_out_2_x0 != 1) {
+    return 0;
+  }
+  return (_out_2_x2 == 0) ? 1 : 0;
+}
+
+/* Map the value of final_cond() to the "show" flag: passed through unchanged */
+static inline int final_ok(int cond) {
+  const int verdict = cond;
+  return verdict;
+}
+
+/**********************/
+/* Outcome collection */
+/**********************/
+#define NOUTS 3  /* number of observed registers stored in one outcome */
+typedef intmax_t outcome_t[NOUTS];  /* one observed final state */
+
+/* Position of each observed register inside an outcome_t */
+static const int out_1_x0_f = 0 ;
+static const int out_2_x0_f = 1 ;
+static const int out_2_x2_f = 2 ;
+
+
+/* Histogram of the outcomes recorded by one instance (or merged from several) */
+typedef struct hist_t {
+  outs_t *outcomes ;     /* recorded outcomes, managed through the *_outs helpers */
+  count_t n_pos,n_neg ;  /* outcomes for which final_cond held / did not hold */
+} hist_t ;
+
+/* Allocate a histogram holding no outcome, with both counters at zero */
+static hist_t *alloc_hist(void) {
+  hist_t *fresh = malloc_check(sizeof(*fresh)) ;
+
+  fresh->n_neg = 0 ;
+  fresh->n_pos = 0 ;
+  fresh->outcomes = NULL ;
+  return fresh ;
+}
+
+/* Release a histogram: first its outcome collection, then the struct itself */
+static void free_hist(hist_t *h) {
+  outs_t *recorded = h->outcomes ;
+
+  free_outs(recorded) ;
+  free(h) ;
+}
+
+/*
+ * Record outcome o in h with count v; show is forwarded to
+ * add_outcome_outs() together with the outcome width NOUTS.
+ */
+static void add_outcome(hist_t *h, count_t v, outcome_t o, int show) {
+  outs_t *updated = add_outcome_outs(h->outcomes,o,NOUTS,v,show) ;
+
+  h->outcomes = updated ;
+}
+
+/* Fold h1 into h0: merge the outcome collections and add up both counters */
+static void merge_hists(hist_t *h0, hist_t *h1) {
+  outs_t *merged = merge_outs(h0->outcomes,h1->outcomes,NOUTS) ;
+
+  h0->outcomes = merged ;
+  h0->n_neg += h1->n_neg ;
+  h0->n_pos += h1->n_pos ;
+}
+
+/* Return sum_outs() applied to the outcome collection of h */
+static count_t sum_hist(hist_t *h) {
+  count_t total = sum_outs(h->outcomes) ;
+
+  return total ;
+}
+
+
+/*
+ * Print one histogram line on fhist: the count c, a marker ('*' when show
+ * is non-zero, ':' otherwise) and the three observed register values of o.
+ */
+static void do_dump_outcome(FILE *fhist, intmax_t *o, count_t c, int show) {
+  const int marker = show ? '*' : ':';
+  const int v_1_x0 = (int)o[out_1_x0_f];
+  const int v_2_x0 = (int)o[out_2_x0_f];
+  const int v_2_x2 = (int)o[out_2_x2_f];
+
+  fprintf(fhist,"%-6"PCTR"%c>1:X0=%i; 2:X0=%i; 2:X2=%i;\n",c,marker,v_1_x0,v_2_x0,v_2_x2);
+}
+
+/*
+ * Dump every outcome of h on fhist through dump_outs(), which is given
+ * do_dump_outcome() as printer and a local outcome_t as work buffer.
+ */
+static void just_dump_outcomes(FILE *fhist, hist_t *h) {
+  outcome_t scratch ;
+  outs_t *recorded = h->outcomes ;
+
+  dump_outs(fhist,do_dump_outcome,recorded,scratch,NOUTS) ;
+}
+
+/*******************************************************/
+/* Context allocation, freeing and reinitialization    */
+/*******************************************************/
+
+/*
+ * Set up a context whose _p field is already filled in: draw the instance
+ * seed from rand(), allocate every per-iteration array with size_of_test
+ * entries and create the N-party barrier used by check_globals().
+ * The arrays are left uninitialised; reinit() fills them before each run.
+ */
+static void init(ctx_t *_a) {
+  const size_t n_cells = (size_t)_a->_p->size_of_test;
+
+  _a->seed = rand();
+  _a->out_1_x0 = malloc_check(n_cells * sizeof *_a->out_1_x0);
+  _a->out_2_x0 = malloc_check(n_cells * sizeof *_a->out_2_x0);
+  _a->out_2_x2 = malloc_check(n_cells * sizeof *_a->out_2_x2);
+  _a->y = malloc_check(n_cells * sizeof *_a->y);
+  _a->x = malloc_check(n_cells * sizeof *_a->x);
+  _a->fst_barrier = pb_create(N);
+  _a->barrier = malloc_check(n_cells * sizeof *_a->barrier);
+}
+
+/* Release everything init() allocated for the context (not the ctx_t itself) */
+static void finalize(ctx_t *_a) {
+  free(_a->y);
+  free(_a->x);
+  free(_a->out_1_x0);
+  free(_a->out_2_x0);
+  free(_a->out_2_x2);
+  pb_free(_a->fst_barrier);
+  /* the cast drops the volatile qualifier of the flag array */
+  free((void *)_a->barrier);
+}
+
+/*
+ * Reset the context before a run, from the last cell down to the first:
+ * shared variables and barrier flags to 0, observed registers to -239487.
+ */
+static void reinit(ctx_t *_a) {
+  const int unset = -239487;
+  int *x = _a->x;
+  int *y = _a->y;
+  int *r_1_x0 = _a->out_1_x0;
+  int *r_2_x0 = _a->out_2_x0;
+  int *r_2_x2 = _a->out_2_x2;
+  int volatile *flags = _a->barrier;
+  int _i = _a->_p->size_of_test;
+
+  while (--_i >= 0) {
+    y[_i] = 0;
+    x[_i] = 0;
+    r_1_x0[_i] = unset;
+    r_2_x0[_i] = unset;
+    r_2_x2[_i] = unset;
+    flags[_i] = 0;
+  }
+}
+
+/**************************************/
+/* Prefetch (and check) global values */
+/**************************************/
+
+/*
+ * Walk the y and x arrays from the last cell down.  For each cell a random
+ * bit is drawn; when it is set the cell is read and must hold 0, otherwise
+ * fatal() is called.  Ends by waiting on the context's fst_barrier.
+ * NOTE(review): every test thread calls this on the same context, so the
+ * seed is presumably updated concurrently - confirm rand_bit() tolerates it.
+ */
+static void check_globals(ctx_t *_a) {
+  st_t *seed = &(_a->seed);
+  int *x = _a->x;
+  int *y = _a->y;
+  int _i = _a->_p->size_of_test;
+
+  while (--_i >= 0) {
+    if (rand_bit(seed)) {
+      if (y[_i] != 0) fatal("ARMARM04+TER, check_globals failed");
+    }
+    if (rand_bit(seed)) {
+      if (x[_i] != 0) fatal("ARMARM04+TER, check_globals failed");
+    }
+  }
+  pb_wait(_a->fst_barrier);
+}
+
+/***************/
+/* Litmus code */
+/***************/
+
+/* Argument handed to each test thread (P0, P1, P2) */
+typedef struct {
+  int th_id; /* I am running on this thread */
+  int *cpu; /* cpu array; the thread uses the entry at index th_id */
+  ctx_t *_a;   /* In this context */
+} parg_t;
+
+
+
+
+
+/*
+ * Test thread P0.  _vb points to this thread's parg_t.
+ * For every iteration _i, after the per-iteration rendez-vous, stores 1
+ * into x[_i] with a store-release (stlr).  Always returns NULL.
+ */
+static void *P0(void *_vb) {
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];  /* cpu entry for this thread id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM04+TER");
+  check_globals(_a);  /* ends with a wait on the context's fst_barrier */
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  /* _stride passes, each stepping down by _stride: every index is visited once */
+  for (int _j = _stride ; _j > 0 ; _j--) {
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]);
+      int trashed_x0;  /* scratch register holding the constant 1 */
+asm __volatile__ (
+"\n"
+"#START _litmus_P0\n"
+"#_litmus_P0_0\n\t"
+"mov %w[x0],#1\n"
+"#_litmus_P0_1\n\t"
+"stlr %w[x0],[%[x1]]\n"
+"#END _litmus_P0\n\t"
+:[x0] "=&r" (trashed_x0)
+:[x1] "r" (&_a->x[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+/*
+ * Test thread P1.  _vb points to this thread's parg_t.
+ * For every iteration _i, after the per-iteration rendez-vous, reads x[_i]
+ * with a load-acquire (ldar) into X0, which is recorded in out_1_x0[_i],
+ * then unconditionally stores 1 into y[_i] with a plain str.
+ * Always returns NULL.
+ */
+static void *P1(void *_vb) {
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];  /* cpu entry for this thread id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM04+TER");
+  check_globals(_a);  /* ends with a wait on the context's fst_barrier */
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_1_x0 = _a->out_1_x0;
+  /* _stride passes, each stepping down by _stride: every index is visited once */
+  for (int _j = _stride ; _j > 0 ; _j--) {
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]);
+      int trashed_x2;  /* scratch register holding the constant 1 */
+asm __volatile__ (
+"\n"
+"#START _litmus_P1\n"
+"#_litmus_P1_0\n\t"
+"ldar %w[x0],[%[x1]]\n"
+"#_litmus_P1_1\n\t"
+"mov %w[x2],#1\n"
+"#_litmus_P1_2\n\t"
+"str %w[x2],[%[x3]]\n"
+"#END _litmus_P1\n\t"
+:[x0] "=&r" (out_1_x0[_i]),[x2] "=&r" (trashed_x2)
+:[x1] "r" (&_a->x[_i]),[x3] "r" (&_a->y[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+/*
+ * Test thread P2.  _vb points to this thread's parg_t.
+ * For every iteration _i, after the per-iteration rendez-vous, reads y[_i]
+ * with a plain ldr into X0 (recorded in out_2_x0[_i]) and then x[_i] with
+ * a load-acquire (ldar) into X2 (recorded in out_2_x2[_i]).
+ * Always returns NULL.
+ */
+static void *P2(void *_vb) {
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];  /* cpu entry for this thread id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM04+TER");
+  check_globals(_a);  /* ends with a wait on the context's fst_barrier */
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_2_x0 = _a->out_2_x0;
+  int *out_2_x2 = _a->out_2_x2;
+  /* _stride passes, each stepping down by _stride: every index is visited once */
+  for (int _j = _stride ; _j > 0 ; _j--) {
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]);
+asm __volatile__ (
+"\n"
+"#START _litmus_P2\n"
+"#_litmus_P2_0\n\t"
+"ldr %w[x0],[%[x1]]\n"
+"#_litmus_P2_1\n\t"
+"ldar %w[x2],[%[x3]]\n"
+"#END _litmus_P2\n\t"
+:[x2] "=&r" (out_2_x2[_i]),[x0] "=&r" (out_2_x0[_i])
+:[x1] "r" (&_a->y[_i]),[x3] "r" (&_a->x[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+/* Argument of zyva(): what one test instance needs, filled in by run() */
+typedef struct {
+  pm_t *p_mutex;    /* mutex created by run(), the same for all instances */
+  pb_t *p_barrier;  /* barrier created by run() for the n_exe instances */
+  param_t *_p;      /* run-time parameters */
+  int z_id;         /* instance number, 0 .. n_exe-1 */
+  int *cpus;        /* this instance's slice of the cpu array */
+} zyva_t;
+
+#define NT N  /* threads started by one instance */
+
+/*
+ * Body of one test instance.  _va points to a zyva_t.
+ * Allocates a private context, then for each of max_run runs resets the
+ * context, starts the N test threads, joins them and records the
+ * size_of_test outcomes in a fresh histogram.
+ * Returns that histogram (a hist_t *); the caller releases it.
+ */
+static void *zyva(void *_va) {
+  zyva_t *_a = (zyva_t *) _va;
+  param_t *_b = _a->_p;
+  pb_wait(_a->p_barrier);  /* wait until all instances have reached this point */
+  pthread_t thread[NT];
+  parg_t parg[N];
+  f_t *fun[] = {&P0,&P1,&P2};
+  hist_t *hist = alloc_hist();
+  ctx_t ctx;
+  ctx._p = _b;
+
+  init(&ctx);
+  /* All threads get the same cpu array; each one indexes it by its th_id */
+  for (int _p = N-1 ; _p >= 0 ; _p--) {
+    parg[_p].th_id = _p; parg[_p]._a = &ctx;
+    parg[_p].cpu = &(_a->cpus[0]);
+  }
+
+  for (int n_run = 0 ; n_run < _b->max_run ; n_run++) {
+    if (_b->aff_mode == aff_random) {
+      /* Instance 0 permutes the cpu array while the others wait between
+         the two barriers */
+      pb_wait(_a->p_barrier);
+      if (_a->z_id == 0) perm_prefix_ints(&ctx.seed,_a->cpus,_b->ncpus_used,_b->ncpus);
+      pb_wait(_a->p_barrier);
+    } else {
+    }
+    if (_b->verbose>1) fprintf(stderr,"Run %i of %i\r", n_run, _b->max_run);
+    reinit(&ctx);
+    /* do_change: permute the thread functions before launching them ... */
+    if (_b->do_change) perm_funs(&ctx.seed,fun,N);
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      launch(&thread[_p],fun[_p],&parg[_p]);
+    }
+    /* ... and the thread handles before joining them */
+    if (_b->do_change) perm_threads(&ctx.seed,thread,NT);
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      join(&thread[_p]);
+    }
+    /* Log final states */
+    for (int _i = _b->size_of_test-1 ; _i >= 0 ; _i--) {
+      int _out_1_x0_i = ctx.out_1_x0[_i];
+      int _out_2_x0_i = ctx.out_2_x0[_i];
+      int _out_2_x2_i = ctx.out_2_x2[_i];
+      outcome_t o;
+      int cond;
+
+      cond = final_ok(final_cond(_out_1_x0_i,_out_2_x0_i,_out_2_x2_i));
+      o[out_1_x0_f] = _out_1_x0_i;
+      o[out_2_x0_f] = _out_2_x0_i;
+      o[out_2_x2_f] = _out_2_x2_i;
+      add_outcome(hist,1,o,cond);
+      if (cond) { hist->n_pos++; } else { hist->n_neg++; }
+    }
+  }
+
+  finalize(&ctx);
+  return hist;
+}
+
+#define ENOUGH 10
+
+/*
+ * Print the final report for the merged histogram on out and return the
+ * verdict: 1 when no outcome satisfied the exists clause (p_true == 0),
+ * 0 otherwise.  cmd is not used.  The "Positive" figure printed is p_false
+ * and the "Negative" one is p_true, the reported condition being the
+ * negated (~exists) form.  total is printed divided by 1000000.0.
+ */
+static int postlude(FILE *out,cmd_t *cmd,hist_t *hist,count_t p_true,count_t p_false,tsc_t total) {
+  const int cond = (p_true == 0);
+  const char *how_often;
+
+  (void)cmd;
+  if (p_true == 0) {
+    how_often = "Never";
+  } else if (p_false == 0) {
+    how_often = "Always";
+  } else {
+    how_often = "Sometimes";
+  }
+
+  fprintf(out,"Test ARMARM04+TER Forbidden\n");
+  fprintf(out,"Histogram (%i states)\n",finals_outs(hist->outcomes));
+  just_dump_outcomes(out,hist);
+  fprintf(out,"%s\n",cond?"Ok":"No");
+  fprintf(out,"\nWitnesses\n");
+  fprintf(out,"Positive: %" PCTR ", Negative: %" PCTR "\n",p_false,p_true);
+  fprintf(out,"Condition ~exists (1:X0=1 /\\ 2:X0=1 /\\ 2:X2=0) is %svalidated\n",cond ? "" : "NOT ");
+  fprintf(out,"Hash=b4e4b307db93ea419e3d73d315f6593d\n");
+  fprintf(out,"Com=Rf Rf Fr\n");
+  fprintf(out,"Orig=RfeLA PodRWAP Rfe PodRRPA FreAL\n");
+  fprintf(out,"Observation ARMARM04+TER %s %" PCTR " %" PCTR "\n",how_often,p_true,p_false);
+  fprintf(out,"Time ARMARM04+TER %.2f\n",total / 1000000.0);
+  fflush(out);
+  return cond;
+}
+
+/*
+ * Run the whole test: start n_exe concurrent instances of zyva(), merge
+ * their histograms, check the totals and print the report via postlude().
+ *
+ * cmd          - effective settings; cmd->aff_cpus is released here with
+ *                cpus_free()
+ * def_all_cpus - not used by this function
+ * out          - stream the final report is written to
+ *
+ * Returns the value returned by postlude().  Calls fatal() when the number
+ * of recorded outcomes does not match size_of_test * max_run per instance.
+ */
+static int run(cmd_t *cmd,cpus_t *def_all_cpus,FILE *out) {
+  tsc_t start = timeofday();
+  param_t prm ;
+/* Set some parameters */
+  prm.verbose = cmd->verbose;
+  prm.size_of_test = cmd->size_of_test;
+  prm.max_run = cmd->max_run;
+  prm.stride = cmd->stride;
+  prm.do_change = 1;
+  if (cmd->fix) prm.do_change = 0;
+/* Computes number of test concurrent instances */
+  int n_avail = cmd->avail > 0 ? cmd->avail : cmd->aff_cpus->sz;
+  if (n_avail >  cmd->aff_cpus->sz) log_error("Warning: avail=%i, available=%i\n",n_avail, cmd->aff_cpus->sz);
+  int n_exe;
+  if (cmd->n_exe > 0) {
+    n_exe = cmd->n_exe;
+  } else {
+    n_exe = n_avail < N ? 1 : n_avail / N;
+  }
+/* Set affinity parameters */
+  cpus_t *all_cpus = cmd->aff_cpus;
+  int aff_cpus_sz = cmd->aff_mode == aff_random ? max(all_cpus->sz,N*n_exe) : N*n_exe;
+  /* NOTE(review): aff_cpus is filled only in the aff_random and aff_incr
+     modes below; confirm no other mode reaches the threads' cpu lookup. */
+  int aff_cpus[aff_cpus_sz];
+  prm.aff_mode = cmd->aff_mode;
+  prm.ncpus = aff_cpus_sz;
+  prm.ncpus_used = N*n_exe;
+/* Show parameters to user */
+  if (prm.verbose) {
+    log_error( "ARMARM04+TER: n=%i, r=%i, s=%i",n_exe,prm.max_run,prm.size_of_test);
+    log_error(", st=%i",prm.stride);
+    if (cmd->aff_mode == aff_incr) {
+      log_error( ", i=%i",cmd->aff_incr);
+    } else if (cmd->aff_mode == aff_random) {
+      log_error(", +ra");
+    } else if (cmd->aff_mode == aff_custom) {
+      log_error(", +ca");
+    } else if (cmd->aff_mode == aff_scan) {
+      log_error(", +sa");
+    }
+    log_error(", p='");
+    cpus_dump(stderr,cmd->aff_cpus);
+    log_error("'");
+    log_error("\n");
+  }
+  if (cmd->aff_mode == aff_random) {
+    /* Fill the cpu array by cycling through the available cpus */
+    for (int k = 0 ; k < aff_cpus_sz ; k++) {
+      aff_cpus[k] = all_cpus->cpu[k % all_cpus->sz];
+    }
+  }
+  hist_t *hist = NULL;
+  int n_th = n_exe-1;
+  /* n_exe may be 1, and a variable length array must have a size greater
+     than zero: always reserve at least one slot. */
+  pthread_t th[n_th > 0 ? n_th : 1];
+  zyva_t zarg[n_exe];
+  pm_t *p_mutex = pm_create();
+  pb_t *p_barrier = pb_create(n_exe);
+  int next_cpu = 0;
+  int delta = cmd->aff_incr;
+  if (delta <= 0) {
+    /* Non-positive increment: every cpu entry is overwritten with -1
+       (presumably "do not pin" - confirm in force_one_affinity) */
+    for (int k=0 ; k < all_cpus->sz ; k++) all_cpus->cpu[k] = -1;
+    delta = 1;
+  } else {
+    delta %= all_cpus->sz;
+  }
+  int start_scan=0, max_start=gcd(delta,all_cpus->sz);
+  int *aff_p = aff_cpus;
+  for (int k=0 ; k < n_exe ; k++) {
+    zyva_t *p = &zarg[k];
+    p->_p = &prm;
+    p->p_mutex = p_mutex; p->p_barrier = p_barrier;
+    p->z_id = k;
+    p->cpus = aff_p;
+    if (cmd->aff_mode != aff_incr) {
+      aff_p += N;
+    } else {
+      /* Hand out N cpus, walking all_cpus by steps of delta; when the walk
+         returns to its starting index, move the start one entry further */
+      for (int i=0 ; i < N ; i++) {
+        *aff_p = all_cpus->cpu[next_cpu]; aff_p++;
+        next_cpu += delta; next_cpu %= all_cpus->sz;
+        if (next_cpu == start_scan) {
+          start_scan++ ; start_scan %= max_start;
+          next_cpu = start_scan;
+        }
+      }
+    }
+    /* The last instance runs in the calling thread, the others in their own */
+    if (k < n_th) {
+      launch(&th[k],zyva,p);
+    } else {
+      hist = (hist_t *)zyva(p);
+    }
+  }
+
+  /* Each instance must have recorded size_of_test * max_run outcomes */
+  count_t n_outs = prm.size_of_test; n_outs *= prm.max_run;
+  for (int k=0 ; k < n_th ; k++) {
+    hist_t *hk = (hist_t *)join(&th[k]);
+    if (sum_hist(hk) != n_outs || hk->n_pos + hk->n_neg != n_outs) {
+      fatal("ARMARM04+TER, sum_hist");
+    }
+    merge_hists(hist,hk);
+    free_hist(hk);
+  }
+  cpus_free(all_cpus);
+  tsc_t total = timeofday() - start;
+  pm_free(p_mutex);
+  pb_free(p_barrier);
+
+  /* Same check on the merged histogram */
+  n_outs *= n_exe ;
+  if (sum_hist(hist) != n_outs || hist->n_pos + hist->n_neg != n_outs) {
+    fatal("ARMARM04+TER, sum_hist") ;
+  }
+  count_t p_true = hist->n_pos, p_false = hist->n_neg;
+  int cond = postlude(out,cmd,hist,p_true,p_false,total);
+  free_hist(hist);
+  return cond;
+}
+
+
+/*
+ * Entry point of the test program.
+ * Builds the default settings, lets parse_cmd() override them from the
+ * command line, runs the test and turns the value returned by run() into
+ * the process exit status.  When read_force_affinity() reports fewer than
+ * N cpus, nothing is run and the program exits with EXIT_SUCCESS.
+ */
+int main(int argc, char **argv) {
+  cpus_t *dflt_cpus = read_force_affinity(AVAIL,0);
+
+  if (dflt_cpus->sz < N) {
+    /* Too few cpus for the N test threads: skip the test */
+    cpus_free(dflt_cpus);
+    return EXIT_SUCCESS;
+  }
+
+  cmd_t def = { 0, NUMBER_OF_RUN, SIZE_OF_TEST, STRIDE, AVAIL, 0, 0, aff_incr, 0, 0, AFF_INCR, dflt_cpus, NULL, -1, MAX_LOOP, NULL, NULL, -1, -1, -1, 0, 0};
+  cmd_t cmd = def;
+  parse_cmd(argc,argv,&def,&cmd);
+
+  int ok = run(&cmd,dflt_cpus,stdout);
+
+  /* run() released cmd.aff_cpus; free the default set only if it differs */
+  if (cmd.aff_cpus != dflt_cpus) {
+    cpus_free(dflt_cpus);
+  }
+  if (ok) {
+    return EXIT_SUCCESS;
+  }
+  return EXIT_FAILURE;
+}
diff --git a/tests/tcg/mttcg/aarch64/ARMARM04.c b/tests/tcg/mttcg/aarch64/ARMARM04.c
new file mode 100644
index 0000000..c514bcd
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/ARMARM04.c
@@ -0,0 +1,556 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* This C source is a product of litmus7 and includes source that is        */
+/* governed by the CeCILL-B license.                                        */
+/****************************************************************************/
+/* Parameters: compile-time defaults. main() places most of them in the
+   default cmd_t, which parse_cmd() may then override from the command line. */
+#define SIZE_OF_TEST 100000 /* presumably the default for cmd->size_of_test (items per run) */
+#define NUMBER_OF_RUN 10 /* presumably the default for cmd->max_run */
+#define AVAIL 0 /* also passed to read_force_affinity() and force_one_affinity() */
+#define STRIDE 1 /* presumably the default for cmd->stride */
+#define MAX_LOOP 0
+#define N 3 /* litmus threads per test instance (P0, P1, P2) */
+#define AFF_INCR (0) /* presumably the default for cmd->aff_incr */
+/* Includes */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <time.h>
+#include <limits.h>
+#include "utils.h"
+#include "outs.h"
+#include "affinity.h"
+
+/* Run parameters: filled in by run(), then read by zyva() and the P* threads */
+typedef struct {
+  int verbose;          /* copied from cmd->verbose */
+  int size_of_test;     /* number of test items per run */
+  int max_run;          /* number of runs per instance */
+  int stride;           /* index step of the litmus loops */
+  aff_mode_t aff_mode;  /* copied from cmd->aff_mode */
+  int ncpus;            /* size of the CPU table built by run() */
+  int ncpus_used;       /* N * number of instances */
+  int do_change;        /* non-zero: permute functions and threads each run */
+} param_t;
+
+
+/* Full memory barrier: executes "dsb sy"; the "memory" clobber also keeps
+   the compiler from moving memory accesses across the call. */
+inline static void mbar(void) {
+  asm __volatile__ ("dsb sy" ::: "memory");
+}
+
+/*
+ * Start barrier for test item k.
+ * The thread whose id equals k % N raises the flag *b; every other thread
+ * spins until it reads the flag as non-zero.
+ */
+inline static void barrier_wait(unsigned int id, unsigned int k, int volatile *b) {
+  if ((k % N) != id) {
+    /* Waiter: busy-wait on the volatile flag. */
+    while (*b == 0) {
+    }
+    return;
+  }
+  /* Releaser for this item. */
+  *b = 1 ;
+}
+
+/**********************/
+/* Context definition */
+/**********************/
+
+
+/* State of one test instance, shared by its N litmus threads.
+   Every array below is allocated by init() with size_of_test elements. */
+typedef struct {
+/* Shared variables: one int per test item for each litmus location */
+  int *y;
+  int *x;
+/* Final content of observed registers, one slot per test item */
+  int *out_1_x0;
+  int *out_2_x0;
+  int *out_2_x3;
+/* Check data: barrier for N threads, waited on at the end of check_globals() */
+  pb_t *fst_barrier;
+/* Barrier for litmus loop: per-item start flags used by barrier_wait() */
+  int volatile *barrier;
+/* Instance seed, initialised from rand() in init() */
+  st_t seed;
+/* Parameters */
+  param_t *_p;
+} ctx_t;
+
+/*
+ * Evaluate the test condition on one outcome.
+ * Returns 1 when 1:X0=1 /\ 2:X0=1 /\ 2:X3=0 holds, 0 otherwise.
+ */
+inline static int final_cond(int _out_1_x0,int _out_2_x0,int _out_2_x3) {
+  if (_out_1_x0 != 1) {
+    return 0;
+  }
+  if (_out_2_x0 != 1) {
+    return 0;
+  }
+  return _out_2_x3 == 0 ? 1 : 0;
+}
+
+/* Identity on the condition value: an outcome is flagged exactly when
+   final_cond() returned non-zero for it. */
+inline static int final_ok(int cond) {
+  return cond;
+}
+
+/**********************/
+/* Outcome collection */
+/**********************/
+/* Number of observed registers recorded per outcome */
+#define NOUTS 3
+/* One outcome: the observed register values, indexed by the *_f constants */
+typedef intmax_t outcome_t[NOUTS];
+
+/* Slot of each observed register inside an outcome_t */
+static const int out_1_x0_f = 0 ;
+static const int out_2_x0_f = 1 ;
+static const int out_2_x3_f = 2 ;
+
+
+/* Histogram of outcomes plus the two witness counters */
+typedef struct hist_t {
+  outs_t *outcomes ;  /* recorded outcomes; NULL while empty */
+  count_t n_pos ;     /* outcomes for which the condition evaluated to true */
+  count_t n_neg ;     /* outcomes for which it evaluated to false */
+} hist_t ;
+
+/* Allocate an empty histogram: no outcomes, both counters at zero. */
+static hist_t *alloc_hist(void) {
+  hist_t *fresh = malloc_check(sizeof(*fresh)) ;
+  fresh->n_neg = 0 ;
+  fresh->n_pos = 0 ;
+  fresh->outcomes = NULL ;
+  return fresh ;
+}
+
+/* Release a histogram: first its outcome collection, then the struct. */
+static void free_hist(hist_t *h) {
+  outs_t *recorded = h->outcomes ;
+  free_outs(recorded) ;
+  free(h) ;
+}
+
+/* Record outcome o in h with count v; show is forwarded to add_outcome_outs(). */
+static void add_outcome(hist_t *h, count_t v, outcome_t o, int show) {
+  outs_t *updated = add_outcome_outs(h->outcomes,o,NOUTS,v,show) ;
+  h->outcomes = updated ;
+}
+
+/* Fold h1 into h0: add both counters and merge the outcome collections.
+   h1 itself is not freed here. */
+static void merge_hists(hist_t *h0, hist_t *h1) {
+  h0->n_pos = h0->n_pos + h1->n_pos ;
+  h0->n_neg = h0->n_neg + h1->n_neg ;
+  outs_t *merged = merge_outs(h0->outcomes,h1->outcomes,NOUTS) ;
+  h0->outcomes = merged ;
+}
+
+/* Total computed by sum_outs() over the outcomes recorded in h. */
+static count_t sum_hist(hist_t *h) {
+  count_t total = sum_outs(h->outcomes) ;
+  return total ;
+}
+
+
+/* Print one histogram line: the count, a '*' (show) or ':' marker, then the
+   three observed registers. */
+static void do_dump_outcome(FILE *fhist, intmax_t *o, count_t c, int show) {
+  const int marker = show ? '*' : ':';
+  const int p1_x0 = (int)o[out_1_x0_f];
+  const int p2_x0 = (int)o[out_2_x0_f];
+  const int p2_x3 = (int)o[out_2_x3_f];
+
+  fprintf(fhist,"%-6"PCTR"%c>1:X0=%i; 2:X0=%i; 2:X3=%i;\n",c,marker,p1_x0,p2_x0,p2_x3);
+}
+
+/* Dump every outcome of h to fhist, one do_dump_outcome() line each. */
+static void just_dump_outcomes(FILE *fhist, hist_t *h) {
+  outcome_t scratch ; /* work buffer handed to dump_outs() */
+
+  dump_outs(fhist,do_dump_outcome,h->outcomes,scratch,NOUTS) ;
+}
+
+/*******************************************************/
+/* Context allocation, freeing and reinitialization    */
+/*******************************************************/
+
+/* Allocate the per-instance arrays (size_of_test elements each), the
+   N-thread barrier and the seed. Contents are set later by reinit(). */
+static void init(ctx_t *_a) {
+  const int n_items = _a->_p->size_of_test;
+
+  _a->seed = rand();
+  _a->out_1_x0 = malloc_check(n_items * sizeof(_a->out_1_x0[0]));
+  _a->out_2_x0 = malloc_check(n_items * sizeof(_a->out_2_x0[0]));
+  _a->out_2_x3 = malloc_check(n_items * sizeof(_a->out_2_x3[0]));
+  _a->y = malloc_check(n_items * sizeof(_a->y[0]));
+  _a->x = malloc_check(n_items * sizeof(_a->x[0]));
+  _a->fst_barrier = pb_create(N);
+  _a->barrier = malloc_check(n_items * sizeof(_a->barrier[0]));
+}
+
+/* Release everything init() allocated. */
+static void finalize(ctx_t *_a) {
+  free(_a->y);
+  free(_a->x);
+  free(_a->out_1_x0);
+  free(_a->out_2_x0);
+  free(_a->out_2_x3);
+  pb_free(_a->fst_barrier);
+  /* The cast drops the volatile qualifier for free(). */
+  free((void *)_a->barrier);
+}
+
+/* Reset every test item before a run: locations and start flags to 0,
+   observed registers to a recognisable "never written" value. */
+static void reinit(ctx_t *_a) {
+  enum { NOT_WRITTEN = -239487 };
+  int idx = _a->_p->size_of_test;
+
+  while (idx-- > 0) {
+    _a->y[idx] = 0;
+    _a->x[idx] = 0;
+    _a->out_1_x0[idx] = NOT_WRITTEN;
+    _a->out_2_x0[idx] = NOT_WRITTEN;
+    _a->out_2_x3[idx] = NOT_WRITTEN;
+    _a->barrier[idx] = 0;
+  }
+}
+
+/**************************************/
+/* Prefetch (and check) global values */
+/**************************************/
+
+/* Touch a random subset of the shared locations, abort if one of them is not
+   zero, then wait on the instance's fst_barrier.
+   NOTE(review): each P* thread calls this on the same context, so _a->seed
+   is updated by several threads without synchronisation. */
+static void check_globals(ctx_t *_a) {
+  int *const loc_y = _a->y;
+  int *const loc_x = _a->x;
+  int idx = _a->_p->size_of_test;
+
+  while (idx-- > 0) {
+    /* One random bit is drawn per location, y first; the location is only
+       read when its bit is set. */
+    if (rand_bit(&(_a->seed))) {
+      if (loc_y[idx] != 0) fatal("ARMARM04, check_globals failed");
+    }
+    if (rand_bit(&(_a->seed))) {
+      if (loc_x[idx] != 0) fatal("ARMARM04, check_globals failed");
+    }
+  }
+  pb_wait(_a->fst_barrier);
+}
+
+/***************/
+/* Litmus code */
+/***************/
+
+/* Argument handed to each litmus thread function (P0, P1, P2) */
+typedef struct {
+  int th_id; /* Thread index inside the instance; also the index used into cpu[] */
+  int *cpu; /* CPU table of the instance; this thread reads cpu[th_id] */
+  ctx_t *_a;   /* Instance context shared by all threads of the instance */
+} parg_t;
+
+
+
+
+
+/*
+ * Litmus thread P0: for every test item, stores 1 to x[_i] with a
+ * store-release (stlr).
+ * _vb is this thread's parg_t; the function always returns NULL.
+ */
+static void *P0(void *_vb) {
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  /* The CPU table is indexed by the thread id carried in the argument. */
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM04");
+  /* check_globals() ends with a pb_wait() on the context's fst_barrier. */
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  /* _stride interleaved passes; each pass walks its indices downwards. */
+  for (int _j = _stride ; _j > 0 ; _j--) {
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      /* Per-item start flag, see barrier_wait(). */
+      barrier_wait(_th_id,_i,&barrier[_i]);
+      int trashed_x0; /* scratch register that receives the constant 1 */
+asm __volatile__ (
+"\n"
+"#START _litmus_P0\n"
+"#_litmus_P0_0\n\t"
+"mov %w[x0],#1\n"
+"#_litmus_P0_1\n\t"
+"stlr %w[x0],[%[x1]]\n"
+"#END _litmus_P0\n\t"
+:[x0] "=&r" (trashed_x0)
+:[x1] "r" (&_a->x[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+/*
+ * Litmus thread P1: for every test item, load-acquires x[_i] (ldar) into the
+ * observed register 1:X0 and, only if the value read is 1, stores 1 to y[_i]
+ * with a plain str.
+ * _vb is this thread's parg_t; the function always returns NULL.
+ */
+static void *P1(void *_vb) {
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  /* The CPU table is indexed by the thread id carried in the argument. */
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM04");
+  /* check_globals() ends with a pb_wait() on the context's fst_barrier. */
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_1_x0 = _a->out_1_x0;
+  /* _stride interleaved passes; each pass walks its indices downwards. */
+  for (int _j = _stride ; _j > 0 ; _j--) {
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      /* Per-item start flag, see barrier_wait(). */
+      barrier_wait(_th_id,_i,&barrier[_i]);
+      int trashed_x2; /* scratch register; written only when the store is executed */
+asm __volatile__ (
+"\n"
+"#START _litmus_P1\n"
+"#_litmus_P1_0\n\t"
+"ldar %w[x0],[%[x1]]\n"
+"#_litmus_P1_1\n\t"
+"cmp %w[x0],#1\n"
+"#_litmus_P1_2\n\t"
+"b.ne 0f\n"
+"#_litmus_P1_3\n\t"
+"mov %w[x2],#1\n"
+"#_litmus_P1_4\n\t"
+"str %w[x2],[%[x3]]\n"
+"#_litmus_P1_5\n"
+"0:\n"
+"#END _litmus_P1\n\t"
+:[x0] "=&r" (out_1_x0[_i]),[x2] "=&r" (trashed_x2)
+:[x1] "r" (&_a->x[_i]),[x3] "r" (&_a->y[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+/*
+ * Litmus thread P2: for every test item, loads y[_i] into the observed
+ * register 2:X0. Only if that value is 1, it computes 0 as X0 eor X0, adds it
+ * to the address of x[_i] (so the address depends on the first load) and
+ * load-acquires x[_i] (ldar) into the observed register 2:X3.
+ * 2:X3 is preset to -1 through the matching input constraint, so it keeps
+ * -1 when the second load is skipped.
+ * _vb is this thread's parg_t; the function always returns NULL.
+ */
+static void *P2(void *_vb) {
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  /* The CPU table is indexed by the thread id carried in the argument. */
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM04");
+  /* check_globals() ends with a pb_wait() on the context's fst_barrier. */
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_2_x0 = _a->out_2_x0;
+  int *out_2_x3 = _a->out_2_x3;
+  /* _stride interleaved passes; each pass walks its indices downwards. */
+  for (int _j = _stride ; _j > 0 ; _j--) {
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      /* Per-item start flag, see barrier_wait(). */
+      barrier_wait(_th_id,_i,&barrier[_i]);
+      int trashed_x2; /* scratch: result of the eor */
+      void* trashed_x5; /* scratch: computed address of x[_i] */
+asm __volatile__ (
+"\n"
+"#START _litmus_P2\n"
+"#_litmus_P2_0\n\t"
+"ldr %w[x0],[%[x1]]\n"
+"#_litmus_P2_1\n\t"
+"cmp %w[x0],#1\n"
+"#_litmus_P2_2\n\t"
+"b.ne 0f\n"
+"#_litmus_P2_3\n\t"
+"eor %w[x2],%w[x0],%w[x0]\n"
+"#_litmus_P2_4\n\t"
+"add %[x5],%[x4],%w[x2],sxtw\n"
+"#_litmus_P2_5\n\t"
+"ldar %w[x3],[%[x5]]\n"
+"#_litmus_P2_6\n"
+"0:\n"
+"#END _litmus_P2\n\t"
+:[x3] "=&r" (out_2_x3[_i]),[x0] "=&r" (out_2_x0[_i]),[x5] "=&r" (trashed_x5),[x2] "=&r" (trashed_x2)
+:[x1] "r" (&_a->y[_i]),"[x3]" (-1),[x4] "r" (&_a->x[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+/* Argument of zyva(): one per concurrent test instance, filled in by run() */
+typedef struct {
+  pm_t *p_mutex;    /* mutex created by run(), common to all instances */
+  pb_t *p_barrier;  /* barrier created by run() for n_exe participants */
+  param_t *_p;      /* run parameters */
+  int z_id;         /* instance index, 0 .. n_exe-1 */
+  int *cpus;        /* first entry of this instance's slice of the CPU table */
+} zyva_t;
+
+#define NT N
+
+/*
+ * Body of one test instance.
+ * Runs the N litmus threads max_run times over size_of_test items each and
+ * collects every final state in a histogram.
+ * _va is the instance's zyva_t. Returns the histogram, allocated with
+ * alloc_hist(); run() releases it with free_hist().
+ */
+static void *zyva(void *_va) {
+  zyva_t *inst = (zyva_t *) _va;
+  param_t *prm = inst->_p;
+
+  /* Meet the other instances before starting. */
+  pb_wait(inst->p_barrier);
+
+  pthread_t workers[NT];
+  parg_t wargs[N];
+  f_t *code[] = {&P0,&P1,&P2};
+  hist_t *hist = alloc_hist();
+  ctx_t ctx;
+
+  ctx._p = prm;
+  init(&ctx);
+
+  /* Every thread gets its own id but the same context and CPU table. */
+  for (int t = N-1 ; t >= 0 ; t--) {
+    wargs[t].th_id = t;
+    wargs[t]._a = &ctx;
+    wargs[t].cpu = inst->cpus;
+  }
+
+  for (int n_run = 0 ; n_run < prm->max_run ; n_run++) {
+    if (prm->aff_mode == aff_random) {
+      /* Instance 0 reshuffles the CPU table while the others wait. */
+      pb_wait(inst->p_barrier);
+      if (inst->z_id == 0) {
+        perm_prefix_ints(&ctx.seed,inst->cpus,prm->ncpus_used,prm->ncpus);
+      }
+      pb_wait(inst->p_barrier);
+    }
+    if (prm->verbose>1) {
+      fprintf(stderr,"Run %i of %i\r", n_run, prm->max_run);
+    }
+    reinit(&ctx);
+
+    if (prm->do_change) {
+      perm_funs(&ctx.seed,code,N);
+    }
+    for (int t = NT-1 ; t >= 0 ; t--) {
+      launch(&workers[t],code[t],&wargs[t]);
+    }
+    if (prm->do_change) {
+      perm_threads(&ctx.seed,workers,NT);
+    }
+    for (int t = NT-1 ; t >= 0 ; t--) {
+      join(&workers[t]);
+    }
+
+    /* Log final states */
+    for (int item = prm->size_of_test ; item-- > 0 ; ) {
+      const int r1_x0 = ctx.out_1_x0[item];
+      const int r2_x0 = ctx.out_2_x0[item];
+      const int r2_x3 = ctx.out_2_x3[item];
+      outcome_t o;
+
+      o[out_1_x0_f] = r1_x0;
+      o[out_2_x0_f] = r2_x0;
+      o[out_2_x3_f] = r2_x3;
+
+      const int cond = final_ok(final_cond(r1_x0,r2_x0,r2_x3));
+      add_outcome(hist,1,o,cond);
+      if (cond) {
+        hist->n_pos++;
+      } else {
+        hist->n_neg++;
+      }
+    }
+  }
+
+  finalize(&ctx);
+  return hist;
+}
+
+#define ENOUGH 10
+
+/*
+ * Print the final report on out: histogram, verdict, witness counts,
+ * condition, test metadata, observation summary and elapsed time.
+ * p_true and p_false count the outcomes for which the condition evaluated to
+ * true and false. The condition is "~exists", so it is validated only when
+ * p_true is 0. cmd is not used.
+ * Returns 1 when the condition is validated, 0 otherwise.
+ */
+static int postlude(FILE *out,cmd_t *cmd,hist_t *hist,count_t p_true,count_t p_false,tsc_t total) {
+  const int cond = (p_true == 0);
+  const char *verdict = cond ? "Ok" : "No";
+  const char *negation = cond ? "" : "NOT ";
+  const char *frequency;
+
+  if (!p_true) {
+    frequency = "Never";
+  } else if (!p_false) {
+    frequency = "Always";
+  } else {
+    frequency = "Sometimes";
+  }
+
+  fprintf(out,"Test ARMARM04 Forbidden\n");
+  fprintf(out,"Histogram (%i states)\n",finals_outs(hist->outcomes));
+  just_dump_outcomes(out,hist);
+  fprintf(out,"%s\n",verdict);
+  fprintf(out,"\nWitnesses\n");
+  /* "Positive" receives p_false and "Negative" p_true. */
+  fprintf(out,"Positive: %" PCTR ", Negative: %" PCTR "\n",p_false,p_true);
+  fprintf(out,"Condition ~exists (1:X0=1 /\\ 2:X0=1 /\\ 2:X3=0) is %svalidated\n",negation);
+  fprintf(out,"Hash=bd6e92e9864bd5671cb7e479e9221bae\n");
+  fprintf(out,"Com=Rf Rf Fr\n");
+  fprintf(out,"Orig=RfeLA PodRWAP Rfe DpAddrdRPA FreAL\n");
+  fprintf(out,"Observation ARMARM04 %s %" PCTR " %" PCTR "\n",frequency,p_true,p_false);
+  fprintf(out,"Time ARMARM04 %.2f\n",total / 1000000.0);
+  fflush(out);
+  return cond;
+}
+
+/*
+ * Run the complete test and print the report on out.
+ *
+ * Takes the number of concurrent test instances (n_exe) from the command
+ * line or derives it from the CPU count, builds the CPU table, starts
+ * n_exe-1 instances in their own threads and runs the last one in the calling
+ * thread, then merges the histograms and reports through postlude().
+ *
+ * cmd           parsed command line; cmd->aff_cpus is freed by this function.
+ * def_all_cpus  not used here.
+ * out           stream that receives the report.
+ * Returns postlude()'s result: non-zero when the condition is validated.
+ */
+static int run(cmd_t *cmd,cpus_t *def_all_cpus,FILE *out) {
+  tsc_t start = timeofday();
+  param_t prm ;
+/* Set some parameters */
+  prm.verbose = cmd->verbose;
+  prm.size_of_test = cmd->size_of_test;
+  prm.max_run = cmd->max_run;
+  prm.stride = cmd->stride;
+  prm.do_change = 1;
+  if (cmd->fix) prm.do_change = 0;
+/* Computes number of test concurrent instances */
+  int n_avail = cmd->avail > 0 ? cmd->avail : cmd->aff_cpus->sz;
+  if (n_avail >  cmd->aff_cpus->sz) log_error("Warning: avail=%i, available=%i\n",n_avail, cmd->aff_cpus->sz);
+  int n_exe;
+  if (cmd->n_exe > 0) {
+    n_exe = cmd->n_exe;
+  } else {
+    n_exe = n_avail < N ? 1 : n_avail / N;
+  }
+/* Set affinity parameters */
+  cpus_t *all_cpus = cmd->aff_cpus;
+  int aff_cpus_sz = cmd->aff_mode == aff_random ? max(all_cpus->sz,N*n_exe) : N*n_exe;
+  int aff_cpus[aff_cpus_sz];
+  prm.aff_mode = cmd->aff_mode;
+  prm.ncpus = aff_cpus_sz;
+  prm.ncpus_used = N*n_exe;
+/* Show parameters to user */
+  if (prm.verbose) {
+    log_error( "ARMARM04: n=%i, r=%i, s=%i",n_exe,prm.max_run,prm.size_of_test);
+    log_error(", st=%i",prm.stride);
+    if (cmd->aff_mode == aff_incr) {
+      log_error( ", i=%i",cmd->aff_incr);
+    } else if (cmd->aff_mode == aff_random) {
+      log_error(", +ra");
+    } else if (cmd->aff_mode == aff_custom) {
+      log_error(", +ca");
+    } else if (cmd->aff_mode == aff_scan) {
+      log_error(", +sa");
+    }
+    log_error(", p='");
+    cpus_dump(stderr,cmd->aff_cpus);
+    log_error("'");
+    log_error("\n");
+  }
+  /* Random mode: start from the CPU list repeated cyclically; instance 0
+     permutes the table before each run (see zyva()). */
+  if (cmd->aff_mode == aff_random) {
+    for (int k = 0 ; k < aff_cpus_sz ; k++) {
+      aff_cpus[k] = all_cpus->cpu[k % all_cpus->sz];
+    }
+  }
+  hist_t *hist = NULL;
+  int n_th = n_exe-1;
+  /* n_th is 0 when a single instance runs. A variable-length array of size
+     zero is undefined behaviour in C, so always reserve at least one slot. */
+  pthread_t th[n_th > 0 ? n_th : 1];
+  zyva_t zarg[n_exe];
+  pm_t *p_mutex = pm_create();
+  pb_t *p_barrier = pb_create(n_exe);
+  int next_cpu = 0;
+  int delta = cmd->aff_incr;
+  if (delta <= 0) {
+    /* No positive increment: every CPU entry becomes -1 (presumably "do not
+       pin"; the value is interpreted by force_one_affinity()). */
+    for (int k=0 ; k < all_cpus->sz ; k++) all_cpus->cpu[k] = -1;
+    delta = 1;
+  } else {
+    delta %= all_cpus->sz;
+  }
+  int start_scan=0, max_start=gcd(delta,all_cpus->sz);
+  int *aff_p = aff_cpus;
+  for (int k=0 ; k < n_exe ; k++) {
+    zyva_t *p = &zarg[k];
+    p->_p = &prm;
+    p->p_mutex = p_mutex; p->p_barrier = p_barrier;
+    p->z_id = k;
+    p->cpus = aff_p;
+    if (cmd->aff_mode != aff_incr) {
+      aff_p += N;
+    } else {
+      /* Incremental mode: take N CPUs for this instance, stepping through
+         the CPU list by delta and moving to the next start position each
+         time the walk returns to its starting point. */
+      for (int i=0 ; i < N ; i++) {
+        *aff_p = all_cpus->cpu[next_cpu]; aff_p++;
+        next_cpu += delta; next_cpu %= all_cpus->sz;
+        if (next_cpu == start_scan) {
+          start_scan++ ; start_scan %= max_start;
+          next_cpu = start_scan;
+        }
+      }
+    }
+    /* The last instance runs in the calling thread. */
+    if (k < n_th) {
+      launch(&th[k],zyva,p);
+    } else {
+      hist = (hist_t *)zyva(p);
+    }
+  }
+
+  /* Each instance must have logged exactly size_of_test * max_run outcomes. */
+  count_t n_outs = prm.size_of_test; n_outs *= prm.max_run;
+  for (int k=0 ; k < n_th ; k++) {
+    hist_t *hk = (hist_t *)join(&th[k]);
+    if (sum_hist(hk) != n_outs || hk->n_pos + hk->n_neg != n_outs) {
+      fatal("ARMARM04, sum_hist");
+    }
+    merge_hists(hist,hk);
+    free_hist(hk);
+  }
+  cpus_free(all_cpus);
+  tsc_t total = timeofday() - start;
+  pm_free(p_mutex);
+  pb_free(p_barrier);
+
+  /* Same consistency check on the merged histogram. */
+  n_outs *= n_exe ;
+  if (sum_hist(hist) != n_outs || hist->n_pos + hist->n_neg != n_outs) {
+    fatal("ARMARM04, sum_hist") ;
+  }
+  count_t p_true = hist->n_pos, p_false = hist->n_neg;
+  int cond = postlude(out,cmd,hist,p_true,p_false,total);
+  free_hist(hist);
+  return cond;
+}
+
+
+/*
+ * Entry point: read the CPU set, build the default command line, let
+ * parse_cmd() override it from argv, then run the test.
+ * Exits successfully without running anything when the CPU set holds fewer
+ * than N entries.
+ */
+int main(int argc, char **argv) {
+  cpus_t *host_cpus = read_force_affinity(AVAIL,0);
+
+  /* Too few CPUs for a single test instance. */
+  if (host_cpus->sz < N) {
+    cpus_free(host_cpus);
+    return EXIT_SUCCESS;
+  }
+
+  /* Positional initializer: the field order is that of cmd_t (project header). */
+  cmd_t defaults = { 0, NUMBER_OF_RUN, SIZE_OF_TEST, STRIDE, AVAIL, 0, 0, aff_incr, 0, 0, AFF_INCR, host_cpus, NULL, -1, MAX_LOOP, NULL, NULL, -1, -1, -1, 0, 0};
+  cmd_t opts = defaults;
+  parse_cmd(argc,argv,&defaults,&opts);
+
+  int validated = run(&opts,host_cpus,stdout);
+
+  /* run() already freed opts.aff_cpus; release the default set only when
+     the command line replaced it. */
+  if (opts.aff_cpus != host_cpus) {
+    cpus_free(host_cpus);
+  }
+  if (validated) {
+    return EXIT_SUCCESS;
+  }
+  return EXIT_FAILURE;
+}
diff --git a/tests/tcg/mttcg/aarch64/ARMARM05.c b/tests/tcg/mttcg/aarch64/ARMARM05.c
new file mode 100644
index 0000000..3b80ddc
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/ARMARM05.c
@@ -0,0 +1,553 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* This C source is a product of litmus7 and includes source that is        */
+/* governed by the CeCILL-B license.                                        */
+/****************************************************************************/
+/* Parameters: compile-time defaults. main() places most of them in the
+   default cmd_t, which parse_cmd() may then override from the command line. */
+#define SIZE_OF_TEST 100000 /* presumably the default for cmd->size_of_test (items per run) */
+#define NUMBER_OF_RUN 10 /* presumably the default for cmd->max_run */
+#define AVAIL 0 /* also passed to read_force_affinity() and force_one_affinity() */
+#define STRIDE 1 /* presumably the default for cmd->stride */
+#define MAX_LOOP 0
+#define N 3 /* litmus threads per test instance (P0, P1, P2) */
+#define AFF_INCR (0) /* presumably the default for cmd->aff_incr */
+/* Includes */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <time.h>
+#include <limits.h>
+#include "utils.h"
+#include "outs.h"
+#include "affinity.h"
+
+/* Run parameters: filled in by run(), then read by zyva() and the P* threads */
+typedef struct {
+  int verbose;          /* copied from cmd->verbose */
+  int size_of_test;     /* number of test items per run */
+  int max_run;          /* number of runs per instance */
+  int stride;           /* index step of the litmus loops */
+  aff_mode_t aff_mode;  /* copied from cmd->aff_mode */
+  int ncpus;            /* size of the CPU table built by run() */
+  int ncpus_used;       /* N * number of instances */
+  int do_change;        /* non-zero: permute functions and threads each run */
+} param_t;
+
+
+/* Full memory barrier: executes "dsb sy"; the "memory" clobber also keeps
+   the compiler from moving memory accesses across the call. */
+inline static void mbar(void) {
+  asm __volatile__ ("dsb sy" ::: "memory");
+}
+
+/*
+ * Start barrier for test item k.
+ * The thread whose id equals k % N raises the flag *b; every other thread
+ * spins until it reads the flag as non-zero.
+ */
+inline static void barrier_wait(unsigned int id, unsigned int k, int volatile *b) {
+  if ((k % N) != id) {
+    /* Waiter: busy-wait on the volatile flag. */
+    while (*b == 0) {
+    }
+    return;
+  }
+  /* Releaser for this item. */
+  *b = 1 ;
+}
+
+/**********************/
+/* Context definition */
+/**********************/
+
+
+/* State of one test instance, shared by its N litmus threads.
+   Every array below is allocated by init() with size_of_test elements. */
+typedef struct {
+/* Shared variables: one int per test item for each litmus location */
+  int *y;
+  int *x;
+/* Final content of observed registers, one slot per test item */
+  int *out_1_x0;
+  int *out_2_x0;
+  int *out_2_x3;
+/* Check data: barrier for N threads, waited on at the end of check_globals() */
+  pb_t *fst_barrier;
+/* Barrier for litmus loop: per-item start flags used by barrier_wait() */
+  int volatile *barrier;
+/* Instance seed, initialised from rand() in init() */
+  st_t seed;
+/* Parameters */
+  param_t *_p;
+} ctx_t;
+
+/*
+ * Evaluate the test condition on one outcome.
+ * Returns 1 when 1:X0=1 /\ 2:X0=1 /\ 2:X3=0 holds, 0 otherwise.
+ */
+inline static int final_cond(int _out_1_x0,int _out_2_x0,int _out_2_x3) {
+  if (_out_1_x0 != 1) {
+    return 0;
+  }
+  if (_out_2_x0 != 1) {
+    return 0;
+  }
+  return _out_2_x3 == 0 ? 1 : 0;
+}
+
+/* Identity on the condition value: an outcome is flagged exactly when
+   final_cond() returned non-zero for it. */
+inline static int final_ok(int cond) {
+  return cond;
+}
+
+/**********************/
+/* Outcome collection */
+/**********************/
+/* Number of observed registers recorded per outcome */
+#define NOUTS 3
+/* One outcome: the observed register values, indexed by the *_f constants */
+typedef intmax_t outcome_t[NOUTS];
+
+/* Slot of each observed register inside an outcome_t */
+static const int out_1_x0_f = 0 ;
+static const int out_2_x0_f = 1 ;
+static const int out_2_x3_f = 2 ;
+
+
+/* Histogram of outcomes plus the two witness counters */
+typedef struct hist_t {
+  outs_t *outcomes ;  /* recorded outcomes; NULL while empty */
+  count_t n_pos ;     /* outcomes for which the condition evaluated to true */
+  count_t n_neg ;     /* outcomes for which it evaluated to false */
+} hist_t ;
+
+/* Allocate an empty histogram: no outcomes, both counters at zero. */
+static hist_t *alloc_hist(void) {
+  hist_t *fresh = malloc_check(sizeof(*fresh)) ;
+  fresh->n_neg = 0 ;
+  fresh->n_pos = 0 ;
+  fresh->outcomes = NULL ;
+  return fresh ;
+}
+
+/* Release a histogram: first its outcome collection, then the struct. */
+static void free_hist(hist_t *h) {
+  outs_t *recorded = h->outcomes ;
+  free_outs(recorded) ;
+  free(h) ;
+}
+
+/* Record outcome o in h with count v; show is forwarded to add_outcome_outs(). */
+static void add_outcome(hist_t *h, count_t v, outcome_t o, int show) {
+  outs_t *updated = add_outcome_outs(h->outcomes,o,NOUTS,v,show) ;
+  h->outcomes = updated ;
+}
+
+/* Fold h1 into h0: add both counters and merge the outcome collections.
+   h1 itself is not freed here. */
+static void merge_hists(hist_t *h0, hist_t *h1) {
+  h0->n_pos = h0->n_pos + h1->n_pos ;
+  h0->n_neg = h0->n_neg + h1->n_neg ;
+  outs_t *merged = merge_outs(h0->outcomes,h1->outcomes,NOUTS) ;
+  h0->outcomes = merged ;
+}
+
+/* Total computed by sum_outs() over the outcomes recorded in h. */
+static count_t sum_hist(hist_t *h) {
+  count_t total = sum_outs(h->outcomes) ;
+  return total ;
+}
+
+
+/* Print one histogram line: the count, a '*' (show) or ':' marker, then the
+   three observed registers. */
+static void do_dump_outcome(FILE *fhist, intmax_t *o, count_t c, int show) {
+  const int marker = show ? '*' : ':';
+  const int p1_x0 = (int)o[out_1_x0_f];
+  const int p2_x0 = (int)o[out_2_x0_f];
+  const int p2_x3 = (int)o[out_2_x3_f];
+
+  fprintf(fhist,"%-6"PCTR"%c>1:X0=%i; 2:X0=%i; 2:X3=%i;\n",c,marker,p1_x0,p2_x0,p2_x3);
+}
+
+/* Dump every outcome of h to fhist, one do_dump_outcome() line each. */
+static void just_dump_outcomes(FILE *fhist, hist_t *h) {
+  outcome_t scratch ; /* work buffer handed to dump_outs() */
+
+  dump_outs(fhist,do_dump_outcome,h->outcomes,scratch,NOUTS) ;
+}
+
+/*******************************************************/
+/* Context allocation, freeing and reinitialization    */
+/*******************************************************/
+
+/* Allocate the per-instance arrays (size_of_test elements each), the
+   N-thread barrier and the seed. Contents are set later by reinit(). */
+static void init(ctx_t *_a) {
+  const int n_items = _a->_p->size_of_test;
+
+  _a->seed = rand();
+  _a->out_1_x0 = malloc_check(n_items * sizeof(_a->out_1_x0[0]));
+  _a->out_2_x0 = malloc_check(n_items * sizeof(_a->out_2_x0[0]));
+  _a->out_2_x3 = malloc_check(n_items * sizeof(_a->out_2_x3[0]));
+  _a->y = malloc_check(n_items * sizeof(_a->y[0]));
+  _a->x = malloc_check(n_items * sizeof(_a->x[0]));
+  _a->fst_barrier = pb_create(N);
+  _a->barrier = malloc_check(n_items * sizeof(_a->barrier[0]));
+}
+
+/* Release everything init() allocated. */
+static void finalize(ctx_t *_a) {
+  free(_a->y);
+  free(_a->x);
+  free(_a->out_1_x0);
+  free(_a->out_2_x0);
+  free(_a->out_2_x3);
+  pb_free(_a->fst_barrier);
+  /* The cast drops the volatile qualifier for free(). */
+  free((void *)_a->barrier);
+}
+
+/* Reset every test item before a run: locations and start flags to 0,
+   observed registers to a recognisable "never written" value. */
+static void reinit(ctx_t *_a) {
+  enum { NOT_WRITTEN = -239487 };
+  int idx = _a->_p->size_of_test;
+
+  while (idx-- > 0) {
+    _a->y[idx] = 0;
+    _a->x[idx] = 0;
+    _a->out_1_x0[idx] = NOT_WRITTEN;
+    _a->out_2_x0[idx] = NOT_WRITTEN;
+    _a->out_2_x3[idx] = NOT_WRITTEN;
+    _a->barrier[idx] = 0;
+  }
+}
+
+/**************************************/
+/* Prefetch (and check) global values */
+/**************************************/
+
+/* Touch a random subset of the shared locations, abort if one of them is not
+   zero, then wait on the instance's fst_barrier.
+   NOTE(review): each P* thread calls this on the same context, so _a->seed
+   is updated by several threads without synchronisation. */
+static void check_globals(ctx_t *_a) {
+  int *const loc_y = _a->y;
+  int *const loc_x = _a->x;
+  int idx = _a->_p->size_of_test;
+
+  while (idx-- > 0) {
+    /* One random bit is drawn per location, y first; the location is only
+       read when its bit is set. */
+    if (rand_bit(&(_a->seed))) {
+      if (loc_y[idx] != 0) fatal("ARMARM05, check_globals failed");
+    }
+    if (rand_bit(&(_a->seed))) {
+      if (loc_x[idx] != 0) fatal("ARMARM05, check_globals failed");
+    }
+  }
+  pb_wait(_a->fst_barrier);
+}
+
+/***************/
+/* Litmus code */
+/***************/
+
+/* Argument handed to each litmus thread function (P0, P1, P2) */
+typedef struct {
+  int th_id; /* Thread index inside the instance; also the index used into cpu[] */
+  int *cpu; /* CPU table of the instance; this thread reads cpu[th_id] */
+  ctx_t *_a;   /* Instance context shared by all threads of the instance */
+} parg_t;
+
+
+
+
+
+/*
+ * Litmus thread P0: for every test item, stores 1 to x[_i] with a plain str.
+ * _vb is this thread's parg_t; the function always returns NULL.
+ */
+static void *P0(void *_vb) {
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  /* The CPU table is indexed by the thread id carried in the argument. */
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM05");
+  /* check_globals() ends with a pb_wait() on the context's fst_barrier. */
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  /* _stride interleaved passes; each pass walks its indices downwards. */
+  for (int _j = _stride ; _j > 0 ; _j--) {
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      /* Per-item start flag, see barrier_wait(). */
+      barrier_wait(_th_id,_i,&barrier[_i]);
+      int trashed_x0; /* scratch register that receives the constant 1 */
+asm __volatile__ (
+"\n"
+"#START _litmus_P0\n"
+"#_litmus_P0_0\n\t"
+"mov %w[x0],#1\n"
+"#_litmus_P0_1\n\t"
+"str %w[x0],[%[x1]]\n"
+"#END _litmus_P0\n\t"
+:[x0] "=&r" (trashed_x0)
+:[x1] "r" (&_a->x[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+/*
+ * Litmus thread P1: for every test item, loads x[_i] (plain ldr) into the
+ * observed register 1:X0 and, only if the value read is 1, stores 1 to y[_i]
+ * with a store-release (stlr).
+ * _vb is this thread's parg_t; the function always returns NULL.
+ */
+static void *P1(void *_vb) {
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  /* The CPU table is indexed by the thread id carried in the argument. */
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM05");
+  /* check_globals() ends with a pb_wait() on the context's fst_barrier. */
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_1_x0 = _a->out_1_x0;
+  /* _stride interleaved passes; each pass walks its indices downwards. */
+  for (int _j = _stride ; _j > 0 ; _j--) {
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      /* Per-item start flag, see barrier_wait(). */
+      barrier_wait(_th_id,_i,&barrier[_i]);
+      int trashed_x2; /* scratch register; written only when the store is executed */
+asm __volatile__ (
+"\n"
+"#START _litmus_P1\n"
+"#_litmus_P1_0\n\t"
+"ldr %w[x0],[%[x1]]\n"
+"#_litmus_P1_1\n\t"
+"cmp %w[x0],#1\n"
+"#_litmus_P1_2\n\t"
+"b.ne 0f\n"
+"#_litmus_P1_3\n\t"
+"mov %w[x2],#1\n"
+"#_litmus_P1_4\n\t"
+"stlr %w[x2],[%[x3]]\n"
+"#_litmus_P1_5\n"
+"0:\n"
+"#END _litmus_P1\n\t"
+:[x0] "=&r" (out_1_x0[_i]),[x2] "=&r" (trashed_x2)
+:[x1] "r" (&_a->x[_i]),[x3] "r" (&_a->y[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+/*
+ * Litmus thread P2: for every test item, loads y[_i] into the observed
+ * register 2:X0. Only if that value is 1, it computes 0 as X0 eor X0 and uses
+ * it as the register offset of a plain ldr from x[_i] (so the address depends
+ * on the first load); the value read goes to the observed register 2:X3.
+ * 2:X3 is preset to -1 through the matching input constraint, so it keeps
+ * -1 when the second load is skipped.
+ * _vb is this thread's parg_t; the function always returns NULL.
+ */
+static void *P2(void *_vb) {
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  /* The CPU table is indexed by the thread id carried in the argument. */
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM05");
+  /* check_globals() ends with a pb_wait() on the context's fst_barrier. */
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_2_x0 = _a->out_2_x0;
+  int *out_2_x3 = _a->out_2_x3;
+  /* _stride interleaved passes; each pass walks its indices downwards. */
+  for (int _j = _stride ; _j > 0 ; _j--) {
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      /* Per-item start flag, see barrier_wait(). */
+      barrier_wait(_th_id,_i,&barrier[_i]);
+      int trashed_x2; /* scratch: result of the eor, used as load offset */
+asm __volatile__ (
+"\n"
+"#START _litmus_P2\n"
+"#_litmus_P2_0\n\t"
+"ldr %w[x0],[%[x1]]\n"
+"#_litmus_P2_1\n\t"
+"cmp %w[x0],#1\n"
+"#_litmus_P2_2\n\t"
+"b.ne 0f\n"
+"#_litmus_P2_3\n\t"
+"eor %w[x2],%w[x0],%w[x0]\n"
+"#_litmus_P2_4\n\t"
+"ldr %w[x3],[%[x4],%w[x2],sxtw]\n"
+"#_litmus_P2_5\n"
+"0:\n"
+"#END _litmus_P2\n\t"
+:[x3] "=&r" (out_2_x3[_i]),[x0] "=&r" (out_2_x0[_i]),[x2] "=&r" (trashed_x2)
+:[x1] "r" (&_a->y[_i]),"[x3]" (-1),[x4] "r" (&_a->x[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+/* Argument of zyva(): one per concurrent test instance, filled in by run() */
+typedef struct {
+  pm_t *p_mutex;    /* mutex created by run(), common to all instances */
+  pb_t *p_barrier;  /* barrier created by run() for n_exe participants */
+  param_t *_p;      /* run parameters */
+  int z_id;         /* instance index, 0 .. n_exe-1 */
+  int *cpus;        /* first entry of this instance's slice of the CPU table */
+} zyva_t;
+
+#define NT N
+
+/*
+ * Body of one test instance.
+ * Runs the N litmus threads max_run times over size_of_test items each and
+ * collects every final state in a histogram.
+ * _va is the instance's zyva_t. Returns the histogram, allocated with
+ * alloc_hist(); run() releases it with free_hist().
+ */
+static void *zyva(void *_va) {
+  zyva_t *inst = (zyva_t *) _va;
+  param_t *prm = inst->_p;
+
+  /* Meet the other instances before starting. */
+  pb_wait(inst->p_barrier);
+
+  pthread_t workers[NT];
+  parg_t wargs[N];
+  f_t *code[] = {&P0,&P1,&P2};
+  hist_t *hist = alloc_hist();
+  ctx_t ctx;
+
+  ctx._p = prm;
+  init(&ctx);
+
+  /* Every thread gets its own id but the same context and CPU table. */
+  for (int t = N-1 ; t >= 0 ; t--) {
+    wargs[t].th_id = t;
+    wargs[t]._a = &ctx;
+    wargs[t].cpu = inst->cpus;
+  }
+
+  for (int n_run = 0 ; n_run < prm->max_run ; n_run++) {
+    if (prm->aff_mode == aff_random) {
+      /* Instance 0 reshuffles the CPU table while the others wait. */
+      pb_wait(inst->p_barrier);
+      if (inst->z_id == 0) {
+        perm_prefix_ints(&ctx.seed,inst->cpus,prm->ncpus_used,prm->ncpus);
+      }
+      pb_wait(inst->p_barrier);
+    }
+    if (prm->verbose>1) {
+      fprintf(stderr,"Run %i of %i\r", n_run, prm->max_run);
+    }
+    reinit(&ctx);
+
+    if (prm->do_change) {
+      perm_funs(&ctx.seed,code,N);
+    }
+    for (int t = NT-1 ; t >= 0 ; t--) {
+      launch(&workers[t],code[t],&wargs[t]);
+    }
+    if (prm->do_change) {
+      perm_threads(&ctx.seed,workers,NT);
+    }
+    for (int t = NT-1 ; t >= 0 ; t--) {
+      join(&workers[t]);
+    }
+
+    /* Log final states */
+    for (int item = prm->size_of_test ; item-- > 0 ; ) {
+      const int r1_x0 = ctx.out_1_x0[item];
+      const int r2_x0 = ctx.out_2_x0[item];
+      const int r2_x3 = ctx.out_2_x3[item];
+      outcome_t o;
+
+      o[out_1_x0_f] = r1_x0;
+      o[out_2_x0_f] = r2_x0;
+      o[out_2_x3_f] = r2_x3;
+
+      const int cond = final_ok(final_cond(r1_x0,r2_x0,r2_x3));
+      add_outcome(hist,1,o,cond);
+      if (cond) {
+        hist->n_pos++;
+      } else {
+        hist->n_neg++;
+      }
+    }
+  }
+
+  finalize(&ctx);
+  return hist;
+}
+
+#define ENOUGH 10
+
+/*
+ * Print the final report on out: histogram, verdict, witness counts,
+ * condition, test metadata, observation summary and elapsed time.
+ * p_true and p_false count the outcomes for which the condition evaluated to
+ * true and false. The condition is "~exists", so it is validated only when
+ * p_true is 0. cmd is not used.
+ * Returns 1 when the condition is validated, 0 otherwise.
+ */
+static int postlude(FILE *out,cmd_t *cmd,hist_t *hist,count_t p_true,count_t p_false,tsc_t total) {
+  const int cond = (p_true == 0);
+  const char *verdict = cond ? "Ok" : "No";
+  const char *negation = cond ? "" : "NOT ";
+  const char *frequency;
+
+  if (!p_true) {
+    frequency = "Never";
+  } else if (!p_false) {
+    frequency = "Always";
+  } else {
+    frequency = "Sometimes";
+  }
+
+  fprintf(out,"Test ARMARM05 Forbidden\n");
+  fprintf(out,"Histogram (%i states)\n",finals_outs(hist->outcomes));
+  just_dump_outcomes(out,hist);
+  fprintf(out,"%s\n",verdict);
+  fprintf(out,"\nWitnesses\n");
+  /* "Positive" receives p_false and "Negative" p_true. */
+  fprintf(out,"Positive: %" PCTR ", Negative: %" PCTR "\n",p_false,p_true);
+  fprintf(out,"Condition ~exists (1:X0=1 /\\ 2:X0=1 /\\ 2:X3=0) is %svalidated\n",negation);
+  fprintf(out,"Hash=87598a116e95069f91ea38314a500157\n");
+  fprintf(out,"Com=Rf Rf Fr\n");
+  fprintf(out,"Orig=Rfe PodRWPL RfeLP DpAddrdR Fre\n");
+  fprintf(out,"Observation ARMARM05 %s %" PCTR " %" PCTR "\n",frequency,p_true,p_false);
+  fprintf(out,"Time ARMARM05 %.2f\n",total / 1000000.0);
+  fflush(out);
+  return cond;
+}
+
+/*
+ * Run the complete test and print the report on out.
+ *
+ * Takes the number of concurrent test instances (n_exe) from the command
+ * line or derives it from the CPU count, builds the CPU table, starts
+ * n_exe-1 instances in their own threads and runs the last one in the calling
+ * thread, then merges the histograms and reports through postlude().
+ *
+ * cmd           parsed command line; cmd->aff_cpus is freed by this function.
+ * def_all_cpus  not used here.
+ * out           stream that receives the report.
+ * Returns postlude()'s result: non-zero when the condition is validated.
+ */
+static int run(cmd_t *cmd,cpus_t *def_all_cpus,FILE *out) {
+  tsc_t start = timeofday();
+  param_t prm ;
+/* Set some parameters */
+  prm.verbose = cmd->verbose;
+  prm.size_of_test = cmd->size_of_test;
+  prm.max_run = cmd->max_run;
+  prm.stride = cmd->stride;
+  prm.do_change = 1;
+  if (cmd->fix) prm.do_change = 0;
+/* Computes number of test concurrent instances */
+  int n_avail = cmd->avail > 0 ? cmd->avail : cmd->aff_cpus->sz;
+  if (n_avail >  cmd->aff_cpus->sz) log_error("Warning: avail=%i, available=%i\n",n_avail, cmd->aff_cpus->sz);
+  int n_exe;
+  if (cmd->n_exe > 0) {
+    n_exe = cmd->n_exe;
+  } else {
+    n_exe = n_avail < N ? 1 : n_avail / N;
+  }
+/* Set affinity parameters */
+  cpus_t *all_cpus = cmd->aff_cpus;
+  int aff_cpus_sz = cmd->aff_mode == aff_random ? max(all_cpus->sz,N*n_exe) : N*n_exe;
+  int aff_cpus[aff_cpus_sz];
+  prm.aff_mode = cmd->aff_mode;
+  prm.ncpus = aff_cpus_sz;
+  prm.ncpus_used = N*n_exe;
+/* Show parameters to user */
+  if (prm.verbose) {
+    log_error( "ARMARM05: n=%i, r=%i, s=%i",n_exe,prm.max_run,prm.size_of_test);
+    log_error(", st=%i",prm.stride);
+    if (cmd->aff_mode == aff_incr) {
+      log_error( ", i=%i",cmd->aff_incr);
+    } else if (cmd->aff_mode == aff_random) {
+      log_error(", +ra");
+    } else if (cmd->aff_mode == aff_custom) {
+      log_error(", +ca");
+    } else if (cmd->aff_mode == aff_scan) {
+      log_error(", +sa");
+    }
+    log_error(", p='");
+    cpus_dump(stderr,cmd->aff_cpus);
+    log_error("'");
+    log_error("\n");
+  }
+  /* Random mode: start from the CPU list repeated cyclically; instance 0
+     permutes the table before each run (see zyva()). */
+  if (cmd->aff_mode == aff_random) {
+    for (int k = 0 ; k < aff_cpus_sz ; k++) {
+      aff_cpus[k] = all_cpus->cpu[k % all_cpus->sz];
+    }
+  }
+  hist_t *hist = NULL;
+  int n_th = n_exe-1;
+  /* n_th is 0 when a single instance runs. A variable-length array of size
+     zero is undefined behaviour in C, so always reserve at least one slot. */
+  pthread_t th[n_th > 0 ? n_th : 1];
+  zyva_t zarg[n_exe];
+  pm_t *p_mutex = pm_create();
+  pb_t *p_barrier = pb_create(n_exe);
+  int next_cpu = 0;
+  int delta = cmd->aff_incr;
+  if (delta <= 0) {
+    /* No positive increment: every CPU entry becomes -1 (presumably "do not
+       pin"; the value is interpreted by force_one_affinity()). */
+    for (int k=0 ; k < all_cpus->sz ; k++) all_cpus->cpu[k] = -1;
+    delta = 1;
+  } else {
+    delta %= all_cpus->sz;
+  }
+  int start_scan=0, max_start=gcd(delta,all_cpus->sz);
+  int *aff_p = aff_cpus;
+  for (int k=0 ; k < n_exe ; k++) {
+    zyva_t *p = &zarg[k];
+    p->_p = &prm;
+    p->p_mutex = p_mutex; p->p_barrier = p_barrier;
+    p->z_id = k;
+    p->cpus = aff_p;
+    if (cmd->aff_mode != aff_incr) {
+      aff_p += N;
+    } else {
+      /* Incremental mode: take N CPUs for this instance, stepping through
+         the CPU list by delta and moving to the next start position each
+         time the walk returns to its starting point. */
+      for (int i=0 ; i < N ; i++) {
+        *aff_p = all_cpus->cpu[next_cpu]; aff_p++;
+        next_cpu += delta; next_cpu %= all_cpus->sz;
+        if (next_cpu == start_scan) {
+          start_scan++ ; start_scan %= max_start;
+          next_cpu = start_scan;
+        }
+      }
+    }
+    /* The last instance runs in the calling thread. */
+    if (k < n_th) {
+      launch(&th[k],zyva,p);
+    } else {
+      hist = (hist_t *)zyva(p);
+    }
+  }
+
+  /* Each instance must have logged exactly size_of_test * max_run outcomes. */
+  count_t n_outs = prm.size_of_test; n_outs *= prm.max_run;
+  for (int k=0 ; k < n_th ; k++) {
+    hist_t *hk = (hist_t *)join(&th[k]);
+    if (sum_hist(hk) != n_outs || hk->n_pos + hk->n_neg != n_outs) {
+      fatal("ARMARM05, sum_hist");
+    }
+    merge_hists(hist,hk);
+    free_hist(hk);
+  }
+  cpus_free(all_cpus);
+  tsc_t total = timeofday() - start;
+  pm_free(p_mutex);
+  pb_free(p_barrier);
+
+  /* Same consistency check on the merged histogram. */
+  n_outs *= n_exe ;
+  if (sum_hist(hist) != n_outs || hist->n_pos + hist->n_neg != n_outs) {
+    fatal("ARMARM05, sum_hist") ;
+  }
+  count_t p_true = hist->n_pos, p_false = hist->n_neg;
+  int cond = postlude(out,cmd,hist,p_true,p_false,total);
+  free_hist(hist);
+  return cond;
+}
+
+
+/*
+ * Entry point: read the CPU set, build the default command line, let
+ * parse_cmd() override it from argv, then run the test.
+ * Exits successfully without running anything when the CPU set holds fewer
+ * than N entries.
+ */
+int main(int argc, char **argv) {
+  cpus_t *host_cpus = read_force_affinity(AVAIL,0);
+
+  /* Too few CPUs for a single test instance. */
+  if (host_cpus->sz < N) {
+    cpus_free(host_cpus);
+    return EXIT_SUCCESS;
+  }
+
+  /* Positional initializer: the field order is that of cmd_t (project header). */
+  cmd_t defaults = { 0, NUMBER_OF_RUN, SIZE_OF_TEST, STRIDE, AVAIL, 0, 0, aff_incr, 0, 0, AFF_INCR, host_cpus, NULL, -1, MAX_LOOP, NULL, NULL, -1, -1, -1, 0, 0};
+  cmd_t opts = defaults;
+  parse_cmd(argc,argv,&defaults,&opts);
+
+  int validated = run(&opts,host_cpus,stdout);
+
+  /* run() already freed opts.aff_cpus; release the default set only when
+     the command line replaced it. */
+  if (opts.aff_cpus != host_cpus) {
+    cpus_free(host_cpus);
+  }
+  if (validated) {
+    return EXIT_SUCCESS;
+  }
+  return EXIT_FAILURE;
+}
diff --git a/tests/tcg/mttcg/aarch64/ARMARM06+AP+AA.c b/tests/tcg/mttcg/aarch64/ARMARM06+AP+AA.c
new file mode 100644
index 0000000..c44721e
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/ARMARM06+AP+AA.c
@@ -0,0 +1,581 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* This C source is a product of litmus7 and includes source that is        */
+/* governed by the CeCILL-B license.                                        */
+/****************************************************************************/
+/* Parameters */
+#define SIZE_OF_TEST 100000
+#define NUMBER_OF_RUN 10
+#define AVAIL 0
+#define STRIDE 1
+#define MAX_LOOP 0
+#define N 4
+#define AFF_INCR (0)
+/* Includes */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <time.h>
+#include <limits.h>
+#include "utils.h"
+#include "outs.h"
+#include "affinity.h"
+
+/* params */
+typedef struct {
+  int verbose;              /* copied from cmd->verbose in run(); nonzero logs the parameters, >1 also prints per-run progress in zyva() */
+  int size_of_test,max_run; /* cells per array (one per test iteration) / number of runs performed by zyva() */
+  int stride;               /* step between consecutive indices in the P0..P3 inner loops */
+  aff_mode_t aff_mode;      /* affinity mode, copied from cmd->aff_mode in run() */
+  int ncpus, ncpus_used;    /* size of the aff_cpus table built in run() / N * n_exe */
+  int do_change;            /* 1 unless cmd->fix is set; enables perm_funs()/perm_threads() in zyva() */
+} param_t;
+
+
+/* Full memory barrier */
+inline static void mbar(void) { /* issues "dsb sy"; the "memory" clobber also keeps the compiler from moving memory accesses across the statement */
+  asm __volatile__ ("dsb sy" ::: "memory");
+}
+
+/* Barriers macros */
+/* Iteration rendezvous: the thread whose id equals k % N sets *b to 1; all other threads spin until *b is nonzero. */
+static inline void barrier_wait(unsigned int id, unsigned int k, int volatile *b) {
+  if (id == k % N) { *b = 1; return; }
+  while (*b == 0) {
+    /* busy-wait on the volatile flag */
+  }
+}
+
+/**********************/
+/* Context definition */
+/**********************/
+
+
+typedef struct {
+/* Shared variables of the test: one cell per iteration, allocated in init() */
+  int *y;
+  int *x;
+/* Final content of observed registers, saved per iteration by P1 (out_1_*) and P3 (out_3_*) */
+  int *out_1_x0;
+  int *out_1_x2;
+  int *out_3_x0;
+  int *out_3_x2;
+/* Check data: barrier created with pb_create(N) in init() and waited on at the end of check_globals() */
+  pb_t *fst_barrier;
+/* Barrier for litmus loop: one flag per iteration, used by barrier_wait() */
+  int volatile *barrier;
+/* Instance seed: set from rand() in init(), passed to rand_bit() and the perm_* helpers */
+  st_t seed;
+/* Parameters: the param_t of the enclosing zyva() instance */
+  param_t *_p;
+} ctx_t;
+
+/*
+ * Tests one recorded outcome against the proposition of the final condition:
+ *   1:X0=1 /\ 1:X2=0 /\ 3:X0=1 /\ 3:X2=0
+ *
+ * Arguments are the values saved from the observed registers of P1
+ * (_out_1_x0, _out_1_x2) and P3 (_out_3_x0, _out_3_x2).
+ * Returns 1 when all four values match, 0 otherwise.
+ */
+inline static int final_cond(int _out_1_x0,int _out_1_x2,int _out_3_x0,int _out_3_x2) {
+  if (_out_1_x0 != 1) {
+    return 0;
+  }
+  if (_out_1_x2 != 0) {
+    return 0;
+  }
+  if (_out_3_x0 != 1) {
+    return 0;
+  }
+  if (_out_3_x2 != 0) {
+    return 0;
+  }
+  return 1;
+}
+
+inline static int final_ok(int cond) { /* pass-through: returns cond unchanged */
+  return cond;
+}
+
+/**********************/
+/* Outcome collection */
+/**********************/
+#define NOUTS 4 /* number of observed register values stored per outcome */
+typedef intmax_t outcome_t[NOUTS]; /* one outcome: NOUTS values, indexed by the *_f constants below */
+
+static const int out_1_x0_f = 0 ; /* slot holding 1:X0 */
+static const int out_1_x2_f = 1 ; /* slot holding 1:X2 */
+static const int out_3_x0_f = 2 ; /* slot holding 3:X0 */
+static const int out_3_x2_f = 3 ; /* slot holding 3:X2 */
+
+
+typedef struct hist_t {
+  outs_t *outcomes ; /* recorded outcomes; alloc_hist() starts it at NULL */
+  count_t n_pos,n_neg ; /* outcomes for which the final condition held / did not hold (counted in zyva()) */
+} hist_t ;
+
+/* Returns a new histogram obtained from malloc_check(), with no outcomes and both counters at zero. */
+static hist_t *alloc_hist(void) {
+  hist_t *fresh = malloc_check(sizeof *fresh);
+  fresh->n_neg = 0; fresh->n_pos = 0; fresh->outcomes = NULL;
+  return fresh;
+}
+
+static void free_hist(hist_t *h) { /* passes h->outcomes to free_outs(), then frees h itself; h is dereferenced, so it must not be NULL */
+  free_outs(h->outcomes) ;
+  free(h) ;
+}
+
+static void add_outcome(hist_t *h, count_t v, outcome_t o, int show) { /* hands outcome o (NOUTS values), count v and flag show to add_outcome_outs() and stores the collection it returns back in h */
+  h->outcomes = add_outcome_outs(h->outcomes,o,NOUTS,v,show) ;
+}
+
+/* Folds h1 into h0: merges the outcome collections with merge_outs() and adds h1's counters to h0's. */
+static void merge_hists(hist_t *h0, hist_t *h1) {
+  h0->outcomes = merge_outs(h0->outcomes,h1->outcomes,NOUTS);
+  h0->n_neg = h0->n_neg + h1->n_neg; h0->n_pos = h0->n_pos + h1->n_pos;
+}
+
+static count_t sum_hist(hist_t *h) { /* returns sum_outs() of the recorded outcomes; run() compares it with the expected number of outcomes */
+  return sum_outs(h->outcomes) ;
+}
+
+
+static void do_dump_outcome(FILE *fhist, intmax_t *o, count_t c, int show) { /* prints one histogram line to fhist: count c, '*' if show is nonzero else ':', then the four register values of o */
+  fprintf(fhist,"%-6"PCTR"%c>1:X0=%i; 1:X2=%i; 3:X0=%i; 3:X2=%i;\n",c,show ? '*' : ':',(int)o[out_1_x0_f],(int)o[out_1_x2_f],(int)o[out_3_x0_f],(int)o[out_3_x2_f]);
+}
+
+static void just_dump_outcomes(FILE *fhist, hist_t *h) { /* passes h's outcomes to dump_outs() with do_dump_outcome as the per-outcome printer */
+  outcome_t buff ; /* scratch outcome handed to dump_outs() */
+  dump_outs(fhist,do_dump_outcome,h->outcomes,buff,NOUTS) ;
+}
+
+/*******************************************************/
+/* Context allocation, freeing and reinitialization    */
+/*******************************************************/
+
+static void init(ctx_t *_a) {
+  /* Every array gets one cell per test iteration; reinit() fills them before each run. */
+  const size_t cells = (size_t)_a->_p->size_of_test;
+  _a->seed = rand();
+  _a->out_1_x0 = malloc_check(cells * sizeof *_a->out_1_x0);
+  _a->out_1_x2 = malloc_check(cells * sizeof *_a->out_1_x2);
+  _a->out_3_x0 = malloc_check(cells * sizeof *_a->out_3_x0);
+  _a->out_3_x2 = malloc_check(cells * sizeof *_a->out_3_x2);
+  _a->y = malloc_check(cells * sizeof *_a->y);
+  _a->x = malloc_check(cells * sizeof *_a->x);
+  _a->fst_barrier = pb_create(N);
+  _a->barrier = malloc_check(cells * sizeof *_a->barrier);
+}
+
+/* Frees everything init() allocated for the context (the ctx_t object itself is not freed). */
+static void finalize(ctx_t *_a) {
+  void *arrays[] = { _a->y, _a->x, _a->out_1_x0, _a->out_1_x2, _a->out_3_x0, _a->out_3_x2 };
+  for (size_t k = 0 ; k < sizeof arrays / sizeof arrays[0] ; k++) {
+    free(arrays[k]);
+  }
+  pb_free(_a->fst_barrier);
+  /* the cast drops the volatile qualifier, which free() does not accept */
+  free((void *)_a->barrier);
+}
+
+/* Resets every cell before a run: shared variables and spin flags to 0, saved registers to the sentinel -239487. */
+static void reinit(ctx_t *_a) {
+  const int sentinel = -239487;
+  int cell = _a->_p->size_of_test;
+  while (cell-- > 0) {
+    _a->y[cell] = 0; _a->x[cell] = 0;
+    _a->out_1_x0[cell] = sentinel; _a->out_1_x2[cell] = sentinel;
+    _a->out_3_x0[cell] = sentinel; _a->out_3_x2[cell] = sentinel;
+    _a->barrier[cell] = 0;
+  }
+}
+
+/**************************************/
+/* Prefetch (and check) global values */
+/**************************************/
+
+static void check_globals(ctx_t *_a) { /* randomly samples the shared cells, calling fatal() if a sampled cell is nonzero, then waits on fst_barrier */
+  int *y = _a->y;
+  int *x = _a->x;
+  for (int _i = _a->_p->size_of_test-1 ; _i >= 0 ; _i--) {
+    if (rand_bit(&(_a->seed)) && y[_i] != 0) fatal("ARMARM06+AP+AA, check_globals failed"); /* y[_i] is read only when rand_bit() returns nonzero */
+    if (rand_bit(&(_a->seed)) && x[_i] != 0) fatal("ARMARM06+AP+AA, check_globals failed"); /* same sampling for x[_i] */
+  }
+  pb_wait(_a->fst_barrier); /* NOTE(review): presumably returns once all N threads of this context have arrived -- confirm in utils.c */
+}
+
+/***************/
+/* Litmus code */
+/***************/
+
+typedef struct {
+  int th_id; /* I am running on this thread: index 0..N-1 assigned in zyva() */
+  int *cpu; /* On this cpu: table of CPU numbers; the thread reads cpu[th_id] */
+  ctx_t *_a;   /* In this context: the ctx_t shared by the N threads of one instance */
+} parg_t;
+
+
+
+
+
+static void *P0(void *_vb) { /* Thread body of P0: on every iteration performs a store-release (stlr) of 1 to x[_i]. _vb points to a parg_t; always returns NULL. */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id]; /* CPU table entry selected by this thread's id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM06+AP+AA"); /* NOTE(review): presumably binds this thread to _ecpu -- helper is defined outside this file, confirm */
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* pass _j starts at index size-_j; the _stride passes together touch each index once */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]); /* per-iteration rendezvous on flag _i */
+      int trashed_x0; /* scratch output for the register that holds the constant 1 */
+asm __volatile__ (
+"\n"
+"#START _litmus_P0\n"
+"#_litmus_P0_0\n\t"
+"mov %w[x0],#1\n"
+"#_litmus_P0_1\n\t"
+"stlr %w[x0],[%[x1]]\n"
+"#END _litmus_P0\n\t"
+:[x0] "=&r" (trashed_x0)
+:[x1] "r" (&_a->x[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+static void *P1(void *_vb) { /* Thread body of P1: on every iteration a load-acquire (ldar) of x[_i] is saved in out_1_x0[_i], then a plain load (ldr) of y[_i] in out_1_x2[_i]. _vb points to a parg_t; always returns NULL. */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id]; /* CPU table entry selected by this thread's id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM06+AP+AA"); /* NOTE(review): presumably binds this thread to _ecpu -- helper is defined outside this file, confirm */
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_1_x0 = _a->out_1_x0; /* receives the value loaded from x */
+  int *out_1_x2 = _a->out_1_x2; /* receives the value loaded from y */
+  for (int _j = _stride ; _j > 0 ; _j--) { /* pass _j starts at index size-_j; the _stride passes together touch each index once */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]); /* per-iteration rendezvous on flag _i */
+asm __volatile__ (
+"\n"
+"#START _litmus_P1\n"
+"#_litmus_P1_0\n\t"
+"ldar %w[x0],[%[x1]]\n"
+"#_litmus_P1_1\n\t"
+"ldr %w[x2],[%[x3]]\n"
+"#END _litmus_P1\n\t"
+:[x2] "=&r" (out_1_x2[_i]),[x0] "=&r" (out_1_x0[_i])
+:[x1] "r" (&_a->x[_i]),[x3] "r" (&_a->y[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+static void *P2(void *_vb) { /* Thread body of P2: on every iteration performs a store-release (stlr) of 1 to y[_i]. _vb points to a parg_t; always returns NULL. */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id]; /* CPU table entry selected by this thread's id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM06+AP+AA"); /* NOTE(review): presumably binds this thread to _ecpu -- helper is defined outside this file, confirm */
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* pass _j starts at index size-_j; the _stride passes together touch each index once */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]); /* per-iteration rendezvous on flag _i */
+      int trashed_x0; /* scratch output for the register that holds the constant 1 */
+asm __volatile__ (
+"\n"
+"#START _litmus_P2\n"
+"#_litmus_P2_0\n\t"
+"mov %w[x0],#1\n"
+"#_litmus_P2_1\n\t"
+"stlr %w[x0],[%[x1]]\n"
+"#END _litmus_P2\n\t"
+:[x0] "=&r" (trashed_x0)
+:[x1] "r" (&_a->y[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+static void *P3(void *_vb) { /* Thread body of P3: on every iteration a load-acquire (ldar) of y[_i] is saved in out_3_x0[_i], then a second load-acquire (ldar) of x[_i] in out_3_x2[_i]. _vb points to a parg_t; always returns NULL. */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id]; /* CPU table entry selected by this thread's id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM06+AP+AA"); /* NOTE(review): presumably binds this thread to _ecpu -- helper is defined outside this file, confirm */
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_3_x0 = _a->out_3_x0; /* receives the value loaded from y */
+  int *out_3_x2 = _a->out_3_x2; /* receives the value loaded from x */
+  for (int _j = _stride ; _j > 0 ; _j--) { /* pass _j starts at index size-_j; the _stride passes together touch each index once */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]); /* per-iteration rendezvous on flag _i */
+asm __volatile__ (
+"\n"
+"#START _litmus_P3\n"
+"#_litmus_P3_0\n\t"
+"ldar %w[x0],[%[x1]]\n"
+"#_litmus_P3_1\n\t"
+"ldar %w[x2],[%[x3]]\n"
+"#END _litmus_P3\n\t"
+:[x2] "=&r" (out_3_x2[_i]),[x0] "=&r" (out_3_x0[_i])
+:[x1] "r" (&_a->y[_i]),[x3] "r" (&_a->x[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+typedef struct {
+  pm_t *p_mutex; /* mutex created with pm_create() in run(); not referenced by zyva() */
+  pb_t *p_barrier; /* barrier created with pb_create(n_exe) in run(), shared by all instances */
+  param_t *_p; /* test parameters; run() points every instance at the same param_t */
+  int z_id; /* index of the instance, 0..n_exe-1 */
+  int *cpus; /* start of this instance's N entries in the aff_cpus table built in run() */
+} zyva_t;
+
+#define NT N
+
+static void *zyva(void *_va) { /* Body of one test instance: _va points to a zyva_t. Runs max_run rounds of the N litmus threads and returns a newly allocated hist_t * with the outcomes (run() frees it). */
+  zyva_t *_a = (zyva_t *) _va;
+  param_t *_b = _a->_p;
+  pb_wait(_a->p_barrier); /* wait on the barrier shared by all instances before doing any work */
+  pthread_t thread[NT];
+  parg_t parg[N];
+  f_t *fun[] = {&P0,&P1,&P2,&P3};
+  hist_t *hist = alloc_hist();
+  ctx_t ctx;
+  ctx._p = _b;
+
+  init(&ctx);
+  for (int _p = N-1 ; _p >= 0 ; _p--) {
+    parg[_p].th_id = _p; parg[_p]._a = &ctx;
+    parg[_p].cpu = &(_a->cpus[0]); /* every thread gets the same table and indexes it with its th_id */
+  }
+
+  for (int n_run = 0 ; n_run < _b->max_run ; n_run++) {
+    if (_b->aff_mode == aff_random) {
+      pb_wait(_a->p_barrier);
+      if (_a->z_id == 0) perm_prefix_ints(&ctx.seed,_a->cpus,_b->ncpus_used,_b->ncpus); /* only instance 0 permutes; its cpus pointer is the start of the whole table */
+      pb_wait(_a->p_barrier);
+    } else {
+    }
+    if (_b->verbose>1) fprintf(stderr,"Run %i of %i\r", n_run, _b->max_run);
+    reinit(&ctx);
+    if (_b->do_change) perm_funs(&ctx.seed,fun,N); /* parg[] is not permuted, so a function may run with a different th_id from run to run */
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      launch(&thread[_p],fun[_p],&parg[_p]);
+    }
+    if (_b->do_change) perm_threads(&ctx.seed,thread,NT);
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      join(&thread[_p]);
+    }
+    /* Log final states */
+    for (int _i = _b->size_of_test-1 ; _i >= 0 ; _i--) {
+      int _out_1_x0_i = ctx.out_1_x0[_i];
+      int _out_1_x2_i = ctx.out_1_x2[_i];
+      int _out_3_x0_i = ctx.out_3_x0[_i];
+      int _out_3_x2_i = ctx.out_3_x2[_i];
+      outcome_t o;
+      int cond;
+
+      cond = final_ok(final_cond(_out_1_x0_i,_out_1_x2_i,_out_3_x0_i,_out_3_x2_i));
+      o[out_1_x0_f] = _out_1_x0_i;
+      o[out_1_x2_f] = _out_1_x2_i;
+      o[out_3_x0_f] = _out_3_x0_i;
+      o[out_3_x2_f] = _out_3_x2_i;
+      add_outcome(hist,1,o,cond); /* each iteration is recorded with count 1; cond is passed as the show flag */
+      if (cond) { hist->n_pos++; } else { hist->n_neg++; }
+    }
+  }
+
+  finalize(&ctx);
+  return hist;
+}
+
+#define ENOUGH 10
+
+/* Prints the report for this test on out and returns 1 when no outcome matched the condition (p_true == 0), else 0; cmd is unused. */
+static int postlude(FILE *out,cmd_t *cmd,hist_t *hist,count_t p_true,count_t p_false,tsc_t total) {
+  const int validated = (p_true == 0);
+  const char *frequency = "Sometimes";
+  if (!p_true) { frequency = "Never"; } else if (!p_false) { frequency = "Always"; }
+  fprintf(out,"Test ARMARM06+AP+AA Forbidden\n");
+  fprintf(out,"Histogram (%i states)\n",finals_outs(hist->outcomes));
+  just_dump_outcomes(out,hist);
+  fprintf(out,"%s\n",validated ? "Ok" : "No");
+  fprintf(out,"\nWitnesses\n");
+  /* The condition is a ~exists: "Positive" reports p_false and "Negative" reports p_true. */
+  fprintf(out,"Positive: %" PCTR ", Negative: %" PCTR "\n",p_false,p_true);
+  fprintf(out,"Condition ~exists (1:X0=1 /\\ 1:X2=0 /\\ 3:X0=1 /\\ 3:X2=0) is %svalidated\n",validated ? "" : "NOT ");
+  fprintf(out,"Hash=09790f7c95472c70e7c2cd1b8b1f7c97\n");
+  fprintf(out,"Com=Rf Fr Rf Fr\n");
+  fprintf(out,"Orig=RfeLA PodRRAP FrePL RfeLA PodRRAA FreAL\n");
+  fprintf(out,"Observation ARMARM06+AP+AA %s %" PCTR " %" PCTR "\n",frequency,p_true,p_false);
+  fprintf(out,"Time ARMARM06+AP+AA %.2f\n",total / 1000000.0);
+  fflush(out);
+  return validated;
+}
+
+/* Runs n_exe concurrent instances of the test (n_exe-1 in new threads, the last one in the calling thread), merges their histograms, */
+/* checks the outcome counts, prints the report through postlude() and returns its result. Frees cmd->aff_cpus; def_all_cpus is unused. */
+static int run(cmd_t *cmd,cpus_t *def_all_cpus,FILE *out) {
+  tsc_t start = timeofday();
+  param_t prm ;
+/* Set some parameters */
+  prm.verbose = cmd->verbose;
+  prm.size_of_test = cmd->size_of_test;
+  prm.max_run = cmd->max_run;
+  prm.stride = cmd->stride;
+  prm.do_change = cmd->fix ? 0 : 1;
+/* Computes number of test concurrent instances */
+  int n_avail = cmd->avail > 0 ? cmd->avail : cmd->aff_cpus->sz;
+  if (n_avail >  cmd->aff_cpus->sz) log_error("Warning: avail=%i, available=%i\n",n_avail, cmd->aff_cpus->sz);
+  int n_exe;
+  if (cmd->n_exe > 0) {
+    n_exe = cmd->n_exe;
+  } else {
+    n_exe = n_avail < N ? 1 : n_avail / N; /* one instance per group of N CPUs, at least one */
+  }
+/* Set affinity parameters */
+  cpus_t *all_cpus = cmd->aff_cpus;
+  int aff_cpus_sz = cmd->aff_mode == aff_random ? max(all_cpus->sz,N*n_exe) : N*n_exe;
+  int aff_cpus[aff_cpus_sz];
+  prm.aff_mode = cmd->aff_mode;
+  prm.ncpus = aff_cpus_sz;
+  prm.ncpus_used = N*n_exe;
+/* Show parameters to user */
+  if (prm.verbose) {
+    log_error( "ARMARM06+AP+AA: n=%i, r=%i, s=%i",n_exe,prm.max_run,prm.size_of_test);
+    log_error(", st=%i",prm.stride);
+    if (cmd->aff_mode == aff_incr) {
+      log_error( ", i=%i",cmd->aff_incr);
+    } else if (cmd->aff_mode == aff_random) {
+      log_error(", +ra");
+    } else if (cmd->aff_mode == aff_custom) {
+      log_error(", +ca");
+    } else if (cmd->aff_mode == aff_scan) {
+      log_error(", +sa");
+    }
+    log_error(", p='");
+    cpus_dump(stderr,cmd->aff_cpus);
+    log_error("'");
+    log_error("\n");
+  }
+  if (cmd->aff_mode == aff_random) { /* random mode: start from the CPU list repeated cyclically; zyva() permutes it before each run */
+    for (int k = 0 ; k < aff_cpus_sz ; k++) {
+      aff_cpus[k] = all_cpus->cpu[k % all_cpus->sz];
+    }
+  }
+  hist_t *hist = NULL;
+  int n_th = n_exe-1;
+  pthread_t th[n_th > 0 ? n_th : 1]; /* n_th is 0 for a single instance; a variable-length array needs a size > 0, so keep one slot */
+  zyva_t zarg[n_exe];
+  pm_t *p_mutex = pm_create();
+  pb_t *p_barrier = pb_create(n_exe);
+  int next_cpu = 0;
+  int delta = cmd->aff_incr;
+  if (delta <= 0) { /* no positive increment: every entry of the CPU list is overwritten with -1 */
+    for (int k=0 ; k < all_cpus->sz ; k++) all_cpus->cpu[k] = -1;
+    delta = 1;
+  } else {
+    delta %= all_cpus->sz;
+  }
+  int start_scan=0, max_start=gcd(delta,all_cpus->sz);
+  int *aff_p = aff_cpus;
+  for (int k=0 ; k < n_exe ; k++) {
+    zyva_t *p = &zarg[k];
+    p->_p = &prm;
+    p->p_mutex = p_mutex; p->p_barrier = p_barrier;
+    p->z_id = k;
+    p->cpus = aff_p; /* instance k owns N consecutive entries of aff_cpus */
+    if (cmd->aff_mode != aff_incr) {
+      aff_p += N;
+    } else {
+      for (int i=0 ; i < N ; i++) { /* walk the CPU list in steps of delta; on returning to start_scan, restart from the next start position */
+        *aff_p = all_cpus->cpu[next_cpu]; aff_p++;
+        next_cpu += delta; next_cpu %= all_cpus->sz;
+        if (next_cpu == start_scan) {
+          start_scan++ ; start_scan %= max_start;
+          next_cpu = start_scan;
+        }
+      }
+    }
+    if (k < n_th) {
+      launch(&th[k],zyva,p);
+    } else {
+      hist = (hist_t *)zyva(p); /* the last instance runs in the calling thread */
+    }
+  }
+  /* Every instance must report exactly size_of_test * max_run outcomes */
+  count_t n_outs = prm.size_of_test; n_outs *= prm.max_run;
+  for (int k=0 ; k < n_th ; k++) {
+    hist_t *hk = (hist_t *)join(&th[k]);
+    if (sum_hist(hk) != n_outs || hk->n_pos + hk->n_neg != n_outs) {
+      fatal("ARMARM06+AP+AA, sum_hist");
+    }
+    merge_hists(hist,hk);
+    free_hist(hk);
+  }
+  cpus_free(all_cpus);
+  tsc_t total = timeofday() - start;
+  pm_free(p_mutex);
+  pb_free(p_barrier);
+  n_outs *= n_exe ;
+  if (sum_hist(hist) != n_outs || hist->n_pos + hist->n_neg != n_outs) {
+    fatal("ARMARM06+AP+AA, sum_hist") ;
+  }
+  count_t p_true = hist->n_pos, p_false = hist->n_neg;
+  int cond = postlude(out,cmd,hist,p_true,p_false,total);
+  free_hist(hist);
+  return cond;
+}
+
+
+int main(int argc, char **argv) {
+  /* With fewer than N CPUs the test is not run at all and the program reports success. */
+  cpus_t *default_cpus = read_force_affinity(AVAIL,0);
+  if (default_cpus->sz < N) { cpus_free(default_cpus); return EXIT_SUCCESS; }
+  cmd_t defaults = { 0, NUMBER_OF_RUN, SIZE_OF_TEST, STRIDE, AVAIL, 0, 0, aff_incr, 0, 0, AFF_INCR, default_cpus, NULL, -1, MAX_LOOP, NULL, NULL, -1, -1, -1, 0, 0};
+  cmd_t options = defaults;
+  parse_cmd(argc,argv,&defaults,&options);
+  int validated = run(&options,default_cpus,stdout);
+  /* run() frees options.aff_cpus; free the default set here only if the options no longer point at it. */
+  if (options.aff_cpus != default_cpus) cpus_free(default_cpus);
+  if (!validated) return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/tests/tcg/mttcg/aarch64/ARMARM06+AP+AP.c b/tests/tcg/mttcg/aarch64/ARMARM06+AP+AP.c
new file mode 100644
index 0000000..7389809
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/ARMARM06+AP+AP.c
@@ -0,0 +1,581 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* This C source is a product of litmus7 and includes source that is        */
+/* governed by the CeCILL-B license.                                        */
+/****************************************************************************/
+/* Parameters */
+#define SIZE_OF_TEST 100000
+#define NUMBER_OF_RUN 10
+#define AVAIL 0
+#define STRIDE 1
+#define MAX_LOOP 0
+#define N 4
+#define AFF_INCR (0)
+/* Includes */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <time.h>
+#include <limits.h>
+#include "utils.h"
+#include "outs.h"
+#include "affinity.h"
+
+/* params */
+typedef struct {
+  int verbose;              /* copied from cmd->verbose in run(); nonzero logs the parameters, >1 also prints per-run progress in zyva() */
+  int size_of_test,max_run; /* cells per array (one per test iteration) / number of runs performed by zyva() */
+  int stride;               /* step between consecutive indices in the P0..P3 inner loops */
+  aff_mode_t aff_mode;      /* affinity mode, copied from cmd->aff_mode in run() */
+  int ncpus, ncpus_used;    /* size of the aff_cpus table built in run() / N * n_exe */
+  int do_change;            /* 1 unless cmd->fix is set; enables perm_funs()/perm_threads() in zyva() */
+} param_t;
+
+
+/* Full memory barrier */
+inline static void mbar(void) { /* issues "dsb sy"; the "memory" clobber also keeps the compiler from moving memory accesses across the statement */
+  asm __volatile__ ("dsb sy" ::: "memory");
+}
+
+/* Barriers macros */
+/* Iteration rendezvous: the thread whose id equals k % N sets *b to 1; all other threads spin until *b is nonzero. */
+static inline void barrier_wait(unsigned int id, unsigned int k, int volatile *b) {
+  if (id == k % N) { *b = 1; return; }
+  while (*b == 0) {
+    /* busy-wait on the volatile flag */
+  }
+}
+
+/**********************/
+/* Context definition */
+/**********************/
+
+
+typedef struct {
+/* Shared variables of the test: one cell per iteration, allocated in init() */
+  int *y;
+  int *x;
+/* Final content of observed registers, saved per iteration by P1 (out_1_*) and P3 (out_3_*) */
+  int *out_1_x0;
+  int *out_1_x2;
+  int *out_3_x0;
+  int *out_3_x2;
+/* Check data: barrier created with pb_create(N) in init() and waited on at the end of check_globals() */
+  pb_t *fst_barrier;
+/* Barrier for litmus loop: one flag per iteration, used by barrier_wait() */
+  int volatile *barrier;
+/* Instance seed: set from rand() in init(), passed to rand_bit() and the perm_* helpers */
+  st_t seed;
+/* Parameters: the param_t of the enclosing zyva() instance */
+  param_t *_p;
+} ctx_t;
+
+/*
+ * Tests one recorded outcome against the proposition of the final condition:
+ *   1:X0=1 /\ 1:X2=0 /\ 3:X0=1 /\ 3:X2=0
+ *
+ * Arguments are the values saved from the observed registers of P1
+ * (_out_1_x0, _out_1_x2) and P3 (_out_3_x0, _out_3_x2).
+ * Returns 1 when all four values match, 0 otherwise.
+ */
+inline static int final_cond(int _out_1_x0,int _out_1_x2,int _out_3_x0,int _out_3_x2) {
+  if (_out_1_x0 != 1) {
+    return 0;
+  }
+  if (_out_1_x2 != 0) {
+    return 0;
+  }
+  if (_out_3_x0 != 1) {
+    return 0;
+  }
+  if (_out_3_x2 != 0) {
+    return 0;
+  }
+  return 1;
+}
+
+inline static int final_ok(int cond) { /* pass-through: returns cond unchanged */
+  return cond;
+}
+
+/**********************/
+/* Outcome collection */
+/**********************/
+#define NOUTS 4 /* number of observed register values stored per outcome */
+typedef intmax_t outcome_t[NOUTS]; /* one outcome: NOUTS values, indexed by the *_f constants below */
+
+static const int out_1_x0_f = 0 ; /* slot holding 1:X0 */
+static const int out_1_x2_f = 1 ; /* slot holding 1:X2 */
+static const int out_3_x0_f = 2 ; /* slot holding 3:X0 */
+static const int out_3_x2_f = 3 ; /* slot holding 3:X2 */
+
+
+typedef struct hist_t {
+  outs_t *outcomes ; /* recorded outcomes; alloc_hist() starts it at NULL */
+  count_t n_pos,n_neg ; /* outcomes for which the final condition held / did not hold (counted in zyva()) */
+} hist_t ;
+
+/* Returns a new histogram obtained from malloc_check(), with no outcomes and both counters at zero. */
+static hist_t *alloc_hist(void) {
+  hist_t *fresh = malloc_check(sizeof *fresh);
+  fresh->n_neg = 0; fresh->n_pos = 0; fresh->outcomes = NULL;
+  return fresh;
+}
+
+static void free_hist(hist_t *h) { /* passes h->outcomes to free_outs(), then frees h itself; h is dereferenced, so it must not be NULL */
+  free_outs(h->outcomes) ;
+  free(h) ;
+}
+
+static void add_outcome(hist_t *h, count_t v, outcome_t o, int show) { /* hands outcome o (NOUTS values), count v and flag show to add_outcome_outs() and stores the collection it returns back in h */
+  h->outcomes = add_outcome_outs(h->outcomes,o,NOUTS,v,show) ;
+}
+
+/* Folds h1 into h0: merges the outcome collections with merge_outs() and adds h1's counters to h0's. */
+static void merge_hists(hist_t *h0, hist_t *h1) {
+  h0->outcomes = merge_outs(h0->outcomes,h1->outcomes,NOUTS);
+  h0->n_neg = h0->n_neg + h1->n_neg; h0->n_pos = h0->n_pos + h1->n_pos;
+}
+
+static count_t sum_hist(hist_t *h) { /* returns sum_outs() of the recorded outcomes; run() compares it with the expected number of outcomes */
+  return sum_outs(h->outcomes) ;
+}
+
+
+static void do_dump_outcome(FILE *fhist, intmax_t *o, count_t c, int show) { /* prints one histogram line to fhist: count c, '*' if show is nonzero else ':', then the four register values of o */
+  fprintf(fhist,"%-6"PCTR"%c>1:X0=%i; 1:X2=%i; 3:X0=%i; 3:X2=%i;\n",c,show ? '*' : ':',(int)o[out_1_x0_f],(int)o[out_1_x2_f],(int)o[out_3_x0_f],(int)o[out_3_x2_f]);
+}
+
+static void just_dump_outcomes(FILE *fhist, hist_t *h) { /* passes h's outcomes to dump_outs() with do_dump_outcome as the per-outcome printer */
+  outcome_t buff ; /* scratch outcome handed to dump_outs() */
+  dump_outs(fhist,do_dump_outcome,h->outcomes,buff,NOUTS) ;
+}
+
+/*******************************************************/
+/* Context allocation, freeing and reinitialization    */
+/*******************************************************/
+
+static void init(ctx_t *_a) {
+  /* Every array gets one cell per test iteration; reinit() fills them before each run. */
+  const size_t cells = (size_t)_a->_p->size_of_test;
+  _a->seed = rand();
+  _a->out_1_x0 = malloc_check(cells * sizeof *_a->out_1_x0);
+  _a->out_1_x2 = malloc_check(cells * sizeof *_a->out_1_x2);
+  _a->out_3_x0 = malloc_check(cells * sizeof *_a->out_3_x0);
+  _a->out_3_x2 = malloc_check(cells * sizeof *_a->out_3_x2);
+  _a->y = malloc_check(cells * sizeof *_a->y);
+  _a->x = malloc_check(cells * sizeof *_a->x);
+  _a->fst_barrier = pb_create(N);
+  _a->barrier = malloc_check(cells * sizeof *_a->barrier);
+}
+
+/* Frees everything init() allocated for the context (the ctx_t object itself is not freed). */
+static void finalize(ctx_t *_a) {
+  void *arrays[] = { _a->y, _a->x, _a->out_1_x0, _a->out_1_x2, _a->out_3_x0, _a->out_3_x2 };
+  for (size_t k = 0 ; k < sizeof arrays / sizeof arrays[0] ; k++) {
+    free(arrays[k]);
+  }
+  pb_free(_a->fst_barrier);
+  /* the cast drops the volatile qualifier, which free() does not accept */
+  free((void *)_a->barrier);
+}
+
+/* Resets every cell before a run: shared variables and spin flags to 0, saved registers to the sentinel -239487. */
+static void reinit(ctx_t *_a) {
+  const int sentinel = -239487;
+  int cell = _a->_p->size_of_test;
+  while (cell-- > 0) {
+    _a->y[cell] = 0; _a->x[cell] = 0;
+    _a->out_1_x0[cell] = sentinel; _a->out_1_x2[cell] = sentinel;
+    _a->out_3_x0[cell] = sentinel; _a->out_3_x2[cell] = sentinel;
+    _a->barrier[cell] = 0;
+  }
+}
+
+/**************************************/
+/* Prefetch (and check) global values */
+/**************************************/
+
+static void check_globals(ctx_t *_a) { /* randomly samples the shared cells, calling fatal() if a sampled cell is nonzero, then waits on fst_barrier */
+  int *y = _a->y;
+  int *x = _a->x;
+  for (int _i = _a->_p->size_of_test-1 ; _i >= 0 ; _i--) {
+    if (rand_bit(&(_a->seed)) && y[_i] != 0) fatal("ARMARM06+AP+AP, check_globals failed"); /* y[_i] is read only when rand_bit() returns nonzero */
+    if (rand_bit(&(_a->seed)) && x[_i] != 0) fatal("ARMARM06+AP+AP, check_globals failed"); /* same sampling for x[_i] */
+  }
+  pb_wait(_a->fst_barrier); /* NOTE(review): presumably returns once all N threads of this context have arrived -- confirm in utils.c */
+}
+
+/***************/
+/* Litmus code */
+/***************/
+
+typedef struct {
+  int th_id; /* I am running on this thread: index 0..N-1 assigned in zyva() */
+  int *cpu; /* On this cpu: table of CPU numbers; the thread reads cpu[th_id] */
+  ctx_t *_a;   /* In this context: the ctx_t shared by the N threads of one instance */
+} parg_t;
+
+
+
+
+
+static void *P0(void *_vb) { /* Thread body of P0: on every iteration performs a store-release (stlr) of 1 to x[_i]. _vb points to a parg_t; always returns NULL. */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id]; /* CPU table entry selected by this thread's id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM06+AP+AP"); /* NOTE(review): presumably binds this thread to _ecpu -- helper is defined outside this file, confirm */
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* pass _j starts at index size-_j; the _stride passes together touch each index once */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]); /* per-iteration rendezvous on flag _i */
+      int trashed_x0; /* scratch output for the register that holds the constant 1 */
+asm __volatile__ (
+"\n"
+"#START _litmus_P0\n"
+"#_litmus_P0_0\n\t"
+"mov %w[x0],#1\n"
+"#_litmus_P0_1\n\t"
+"stlr %w[x0],[%[x1]]\n"
+"#END _litmus_P0\n\t"
+:[x0] "=&r" (trashed_x0)
+:[x1] "r" (&_a->x[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+static void *P1(void *_vb) { /* Thread body of P1: on every iteration a load-acquire (ldar) of x[_i] is saved in out_1_x0[_i], then a plain load (ldr) of y[_i] in out_1_x2[_i]. _vb points to a parg_t; always returns NULL. */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id]; /* CPU table entry selected by this thread's id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM06+AP+AP"); /* NOTE(review): presumably binds this thread to _ecpu -- helper is defined outside this file, confirm */
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_1_x0 = _a->out_1_x0; /* receives the value loaded from x */
+  int *out_1_x2 = _a->out_1_x2; /* receives the value loaded from y */
+  for (int _j = _stride ; _j > 0 ; _j--) { /* pass _j starts at index size-_j; the _stride passes together touch each index once */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]); /* per-iteration rendezvous on flag _i */
+asm __volatile__ (
+"\n"
+"#START _litmus_P1\n"
+"#_litmus_P1_0\n\t"
+"ldar %w[x0],[%[x1]]\n"
+"#_litmus_P1_1\n\t"
+"ldr %w[x2],[%[x3]]\n"
+"#END _litmus_P1\n\t"
+:[x2] "=&r" (out_1_x2[_i]),[x0] "=&r" (out_1_x0[_i])
+:[x1] "r" (&_a->x[_i]),[x3] "r" (&_a->y[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+static void *P2(void *_vb) { /* Thread body of P2: on every iteration performs a store-release (stlr) of 1 to y[_i]. _vb points to a parg_t; always returns NULL. */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id]; /* CPU table entry selected by this thread's id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM06+AP+AP"); /* NOTE(review): presumably binds this thread to _ecpu -- helper is defined outside this file, confirm */
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* pass _j starts at index size-_j; the _stride passes together touch each index once */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]); /* per-iteration rendezvous on flag _i */
+      int trashed_x0; /* scratch output for the register that holds the constant 1 */
+asm __volatile__ (
+"\n"
+"#START _litmus_P2\n"
+"#_litmus_P2_0\n\t"
+"mov %w[x0],#1\n"
+"#_litmus_P2_1\n\t"
+"stlr %w[x0],[%[x1]]\n"
+"#END _litmus_P2\n\t"
+:[x0] "=&r" (trashed_x0)
+:[x1] "r" (&_a->y[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+static void *P3(void *_vb) { /* Thread body of P3: on every iteration a load-acquire (ldar) of y[_i] is saved in out_3_x0[_i], then a plain load (ldr) of x[_i] in out_3_x2[_i]. _vb points to a parg_t; always returns NULL. */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id]; /* CPU table entry selected by this thread's id */
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM06+AP+AP"); /* NOTE(review): presumably binds this thread to _ecpu -- helper is defined outside this file, confirm */
+  check_globals(_a);
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_3_x0 = _a->out_3_x0; /* receives the value loaded from y */
+  int *out_3_x2 = _a->out_3_x2; /* receives the value loaded from x */
+  for (int _j = _stride ; _j > 0 ; _j--) { /* pass _j starts at index size-_j; the _stride passes together touch each index once */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) {
+      barrier_wait(_th_id,_i,&barrier[_i]); /* per-iteration rendezvous on flag _i */
+asm __volatile__ (
+"\n"
+"#START _litmus_P3\n"
+"#_litmus_P3_0\n\t"
+"ldar %w[x0],[%[x1]]\n"
+"#_litmus_P3_1\n\t"
+"ldr %w[x2],[%[x3]]\n"
+"#END _litmus_P3\n\t"
+:[x2] "=&r" (out_3_x2[_i]),[x0] "=&r" (out_3_x0[_i])
+:[x1] "r" (&_a->y[_i]),[x3] "r" (&_a->x[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+typedef struct {
+  pm_t *p_mutex; /* mutex created with pm_create() in run(); not referenced by zyva() */
+  pb_t *p_barrier; /* barrier created with pb_create(n_exe) in run(), shared by all instances */
+  param_t *_p; /* test parameters; run() points every instance at the same param_t */
+  int z_id; /* index of the instance, 0..n_exe-1 */
+  int *cpus; /* start of this instance's N entries in the aff_cpus table built in run() */
+} zyva_t;
+
+#define NT N
+
+static void *zyva(void *_va) { /* Body of one test instance: _va points to a zyva_t. Runs max_run rounds of the N litmus threads and returns a newly allocated hist_t * with the outcomes (run() frees it). */
+  zyva_t *_a = (zyva_t *) _va;
+  param_t *_b = _a->_p;
+  pb_wait(_a->p_barrier); /* wait on the barrier shared by all instances before doing any work */
+  pthread_t thread[NT];
+  parg_t parg[N];
+  f_t *fun[] = {&P0,&P1,&P2,&P3};
+  hist_t *hist = alloc_hist();
+  ctx_t ctx;
+  ctx._p = _b;
+
+  init(&ctx);
+  for (int _p = N-1 ; _p >= 0 ; _p--) {
+    parg[_p].th_id = _p; parg[_p]._a = &ctx;
+    parg[_p].cpu = &(_a->cpus[0]); /* every thread gets the same table and indexes it with its th_id */
+  }
+
+  for (int n_run = 0 ; n_run < _b->max_run ; n_run++) {
+    if (_b->aff_mode == aff_random) {
+      pb_wait(_a->p_barrier);
+      if (_a->z_id == 0) perm_prefix_ints(&ctx.seed,_a->cpus,_b->ncpus_used,_b->ncpus); /* only instance 0 permutes; its cpus pointer is the start of the whole table */
+      pb_wait(_a->p_barrier);
+    } else {
+    }
+    if (_b->verbose>1) fprintf(stderr,"Run %i of %i\r", n_run, _b->max_run);
+    reinit(&ctx);
+    if (_b->do_change) perm_funs(&ctx.seed,fun,N); /* parg[] is not permuted, so a function may run with a different th_id from run to run */
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      launch(&thread[_p],fun[_p],&parg[_p]);
+    }
+    if (_b->do_change) perm_threads(&ctx.seed,thread,NT);
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      join(&thread[_p]);
+    }
+    /* Log final states */
+    for (int _i = _b->size_of_test-1 ; _i >= 0 ; _i--) {
+      int _out_1_x0_i = ctx.out_1_x0[_i];
+      int _out_1_x2_i = ctx.out_1_x2[_i];
+      int _out_3_x0_i = ctx.out_3_x0[_i];
+      int _out_3_x2_i = ctx.out_3_x2[_i];
+      outcome_t o;
+      int cond;
+
+      cond = final_ok(final_cond(_out_1_x0_i,_out_1_x2_i,_out_3_x0_i,_out_3_x2_i));
+      o[out_1_x0_f] = _out_1_x0_i;
+      o[out_1_x2_f] = _out_1_x2_i;
+      o[out_3_x0_f] = _out_3_x0_i;
+      o[out_3_x2_f] = _out_3_x2_i;
+      add_outcome(hist,1,o,cond); /* each iteration is recorded with count 1; cond is passed as the show flag */
+      if (cond) { hist->n_pos++; } else { hist->n_neg++; }
+    }
+  }
+
+  finalize(&ctx);
+  return hist;
+}
+
+#define ENOUGH 10
+
+/* Prints the report for this test on out and returns 1 when no outcome matched the condition (p_true == 0), else 0; cmd is unused. */
+static int postlude(FILE *out,cmd_t *cmd,hist_t *hist,count_t p_true,count_t p_false,tsc_t total) {
+  const int validated = (p_true == 0);
+  const char *frequency = "Sometimes";
+  if (!p_true) { frequency = "Never"; } else if (!p_false) { frequency = "Always"; }
+  fprintf(out,"Test ARMARM06+AP+AP Forbidden\n");
+  fprintf(out,"Histogram (%i states)\n",finals_outs(hist->outcomes));
+  just_dump_outcomes(out,hist);
+  fprintf(out,"%s\n",validated ? "Ok" : "No");
+  fprintf(out,"\nWitnesses\n");
+  /* The condition is a ~exists: "Positive" reports p_false and "Negative" reports p_true. */
+  fprintf(out,"Positive: %" PCTR ", Negative: %" PCTR "\n",p_false,p_true);
+  fprintf(out,"Condition ~exists (1:X0=1 /\\ 1:X2=0 /\\ 3:X0=1 /\\ 3:X2=0) is %svalidated\n",validated ? "" : "NOT ");
+  fprintf(out,"Hash=73c88d83e9bc423599f9750ed7d77ac2\n");
+  fprintf(out,"Com=Rf Fr Rf Fr\n");
+  fprintf(out,"Orig=RfeLA PodRRAP FrePL RfeLA PodRRAP FrePL\n");
+  fprintf(out,"Observation ARMARM06+AP+AP %s %" PCTR " %" PCTR "\n",frequency,p_true,p_false);
+  fprintf(out,"Time ARMARM06+AP+AP %.2f\n",total / 1000000.0);
+  fflush(out);
+  return validated;
+}
+
+/* Runs n_exe concurrent instances of the test (n_exe-1 in new threads, the last one in the calling thread), merges their histograms, */
+/* checks the outcome counts, prints the report through postlude() and returns its result. Frees cmd->aff_cpus; def_all_cpus is unused. */
+static int run(cmd_t *cmd,cpus_t *def_all_cpus,FILE *out) {
+  tsc_t start = timeofday();
+  param_t prm ;
+/* Set some parameters */
+  prm.verbose = cmd->verbose;
+  prm.size_of_test = cmd->size_of_test;
+  prm.max_run = cmd->max_run;
+  prm.stride = cmd->stride;
+  prm.do_change = cmd->fix ? 0 : 1;
+/* Computes number of test concurrent instances */
+  int n_avail = cmd->avail > 0 ? cmd->avail : cmd->aff_cpus->sz;
+  if (n_avail >  cmd->aff_cpus->sz) log_error("Warning: avail=%i, available=%i\n",n_avail, cmd->aff_cpus->sz);
+  int n_exe;
+  if (cmd->n_exe > 0) {
+    n_exe = cmd->n_exe;
+  } else {
+    n_exe = n_avail < N ? 1 : n_avail / N; /* one instance per group of N CPUs, at least one */
+  }
+/* Set affinity parameters */
+  cpus_t *all_cpus = cmd->aff_cpus;
+  int aff_cpus_sz = cmd->aff_mode == aff_random ? max(all_cpus->sz,N*n_exe) : N*n_exe;
+  int aff_cpus[aff_cpus_sz];
+  prm.aff_mode = cmd->aff_mode;
+  prm.ncpus = aff_cpus_sz;
+  prm.ncpus_used = N*n_exe;
+/* Show parameters to user */
+  if (prm.verbose) {
+    log_error( "ARMARM06+AP+AP: n=%i, r=%i, s=%i",n_exe,prm.max_run,prm.size_of_test);
+    log_error(", st=%i",prm.stride);
+    if (cmd->aff_mode == aff_incr) {
+      log_error( ", i=%i",cmd->aff_incr);
+    } else if (cmd->aff_mode == aff_random) {
+      log_error(", +ra");
+    } else if (cmd->aff_mode == aff_custom) {
+      log_error(", +ca");
+    } else if (cmd->aff_mode == aff_scan) {
+      log_error(", +sa");
+    }
+    log_error(", p='");
+    cpus_dump(stderr,cmd->aff_cpus);
+    log_error("'");
+    log_error("\n");
+  }
+  if (cmd->aff_mode == aff_random) { /* random mode: start from the CPU list repeated cyclically; zyva() permutes it before each run */
+    for (int k = 0 ; k < aff_cpus_sz ; k++) {
+      aff_cpus[k] = all_cpus->cpu[k % all_cpus->sz];
+    }
+  }
+  hist_t *hist = NULL;
+  int n_th = n_exe-1;
+  pthread_t th[n_th > 0 ? n_th : 1]; /* n_th is 0 for a single instance; a variable-length array needs a size > 0, so keep one slot */
+  zyva_t zarg[n_exe];
+  pm_t *p_mutex = pm_create();
+  pb_t *p_barrier = pb_create(n_exe);
+  int next_cpu = 0;
+  int delta = cmd->aff_incr;
+  if (delta <= 0) { /* no positive increment: every entry of the CPU list is overwritten with -1 */
+    for (int k=0 ; k < all_cpus->sz ; k++) all_cpus->cpu[k] = -1;
+    delta = 1;
+  } else {
+    delta %= all_cpus->sz;
+  }
+  int start_scan=0, max_start=gcd(delta,all_cpus->sz);
+  int *aff_p = aff_cpus;
+  for (int k=0 ; k < n_exe ; k++) {
+    zyva_t *p = &zarg[k];
+    p->_p = &prm;
+    p->p_mutex = p_mutex; p->p_barrier = p_barrier;
+    p->z_id = k;
+    p->cpus = aff_p; /* instance k owns N consecutive entries of aff_cpus */
+    if (cmd->aff_mode != aff_incr) {
+      aff_p += N;
+    } else {
+      for (int i=0 ; i < N ; i++) { /* walk the CPU list in steps of delta; on returning to start_scan, restart from the next start position */
+        *aff_p = all_cpus->cpu[next_cpu]; aff_p++;
+        next_cpu += delta; next_cpu %= all_cpus->sz;
+        if (next_cpu == start_scan) {
+          start_scan++ ; start_scan %= max_start;
+          next_cpu = start_scan;
+        }
+      }
+    }
+    if (k < n_th) {
+      launch(&th[k],zyva,p);
+    } else {
+      hist = (hist_t *)zyva(p); /* the last instance runs in the calling thread */
+    }
+  }
+  /* Every instance must report exactly size_of_test * max_run outcomes */
+  count_t n_outs = prm.size_of_test; n_outs *= prm.max_run;
+  for (int k=0 ; k < n_th ; k++) {
+    hist_t *hk = (hist_t *)join(&th[k]);
+    if (sum_hist(hk) != n_outs || hk->n_pos + hk->n_neg != n_outs) {
+      fatal("ARMARM06+AP+AP, sum_hist");
+    }
+    merge_hists(hist,hk);
+    free_hist(hk);
+  }
+  cpus_free(all_cpus);
+  tsc_t total = timeofday() - start;
+  pm_free(p_mutex);
+  pb_free(p_barrier);
+  n_outs *= n_exe ;
+  if (sum_hist(hist) != n_outs || hist->n_pos + hist->n_neg != n_outs) {
+    fatal("ARMARM06+AP+AP, sum_hist") ;
+  }
+  count_t p_true = hist->n_pos, p_false = hist->n_neg;
+  int cond = postlude(out,cmd,hist,p_true,p_false,total);
+  free_hist(hist);
+  return cond;
+}
+
+
+int main(int argc, char **argv) {
+  /* With fewer than N CPUs the test is not run at all and the program reports success. */
+  cpus_t *default_cpus = read_force_affinity(AVAIL,0);
+  if (default_cpus->sz < N) { cpus_free(default_cpus); return EXIT_SUCCESS; }
+  cmd_t defaults = { 0, NUMBER_OF_RUN, SIZE_OF_TEST, STRIDE, AVAIL, 0, 0, aff_incr, 0, 0, AFF_INCR, default_cpus, NULL, -1, MAX_LOOP, NULL, NULL, -1, -1, -1, 0, 0};
+  cmd_t options = defaults;
+  parse_cmd(argc,argv,&defaults,&options);
+  int validated = run(&options,default_cpus,stdout);
+  /* run() frees options.aff_cpus; free the default set here only if the options no longer point at it. */
+  if (options.aff_cpus != default_cpus) cpus_free(default_cpus);
+  if (!validated) return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
diff --git a/tests/tcg/mttcg/aarch64/ARMARM06.c b/tests/tcg/mttcg/aarch64/ARMARM06.c
new file mode 100644
index 0000000..a2fa38e
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/ARMARM06.c
@@ -0,0 +1,581 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* This C source is a product of litmus7 and includes source that is        */
+/* governed by the CeCILL-B license.                                        */
+/****************************************************************************/
+/* Parameters */
+#define SIZE_OF_TEST 100000
+#define NUMBER_OF_RUN 10
+#define AVAIL 0
+#define STRIDE 1
+#define MAX_LOOP 0
+#define N 4
+#define AFF_INCR (0)
+/* Includes */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <time.h>
+#include <limits.h>
+#include "utils.h"
+#include "outs.h"
+#include "affinity.h"
+
+/* params */
+typedef struct {
+  int verbose; /* verbosity level, copied from cmd->verbose by run() */
+  int size_of_test,max_run; /* length of the per-run arrays, and number of runs */
+  int stride; /* step used by the test threads when walking the arrays */
+  aff_mode_t aff_mode; /* affinity mode, copied from cmd->aff_mode */
+  int ncpus, ncpus_used; /* run() sets these to the affinity array size and to N*n_exe */
+  int do_change; /* non-zero: zyva() permutes functions and threads on each run */
+} param_t;
+
+
+/* Full memory barrier: "dsb sy" together with a compiler-level "memory" clobber */
+inline static void mbar(void) {
+  asm __volatile__ ("dsb sy" ::: "memory");
+}
+
+/* Per-cell rendez-vous: the thread whose id equals k % N raises *b, the others spin until it is raised */
+inline static void barrier_wait(unsigned int id, unsigned int k, int volatile *b) {
+  if ((k % N) != id) {
+    while (*b == 0) ;
+  } else {
+    *b = 1 ;
+  }
+}
+
+/**********************/
+/* Context definition */
+/**********************/
+
+
+typedef struct {
+/* Shared variables: arrays of size_of_test cells, allocated in init() */
+  int *y;
+  int *x;
+/* Final content of observed registers, one array per register */
+  int *out_1_x0;
+  int *out_1_x2;
+  int *out_3_x0;
+  int *out_3_x2;
+/* Barrier the test threads wait on at the end of check_globals() */
+  pb_t *fst_barrier;
+/* Barrier for litmus loop: one flag per cell, used by barrier_wait() */
+  int volatile *barrier;
+/* Instance seed, set from rand() in init() */
+  st_t seed;
+/* Parameters (must be set before init() is called) */
+  param_t *_p;
+} ctx_t;
+
+/*
+ * Final condition of the test, evaluated on one recorded outcome:
+ *   1:X0=1 /\ 1:X2=0 /\ 3:X0=1 /\ 3:X2=0
+ * Returns 1 when all four observed values match, 0 otherwise.
+ */
+inline static int final_cond(int _out_1_x0,int _out_1_x2,int _out_3_x0,int _out_3_x2) {
+  /* values observed by P1 */
+  if (_out_1_x0 != 1) {
+    return 0;
+  }
+  if (_out_1_x2 != 0) {
+    return 0;
+  }
+  /* values observed by P3 */
+  if (_out_3_x0 != 1) {
+    return 0;
+  }
+  if (_out_3_x2 != 0) {
+    return 0;
+  }
+  /* every clause held */
+  return 1;
+}
+
+inline static int final_ok(int cond) { /* identity: the condition value is passed through unchanged */
+  return cond;
+}
+
+/**********************/
+/* Outcome collection */
+/**********************/
+#define NOUTS 4 /* number of observed values per outcome */
+typedef intmax_t outcome_t[NOUTS]; /* one outcome, indexed by the *_f constants below */
+
+static const int out_1_x0_f = 0 ; /* slot of 1:X0 */
+static const int out_1_x2_f = 1 ; /* slot of 1:X2 */
+static const int out_3_x0_f = 2 ; /* slot of 3:X0 */
+static const int out_3_x2_f = 3 ; /* slot of 3:X2 */
+
+
+typedef struct hist_t {
+  outs_t *outcomes ; /* recorded outcomes; NULL in a freshly allocated histogram */
+  count_t n_pos,n_neg ; /* outcomes for which the final condition held / did not hold */
+} hist_t ;
+
+/* Allocate an empty histogram: no recorded outcome, both counters at zero. */
+static hist_t *alloc_hist(void) {
+  hist_t *h = malloc_check(sizeof(*h)) ;
+  h->n_neg = 0 ; h->n_pos = 0 ; h->outcomes = NULL ;
+  return h ;
+}
+
+static void free_hist(hist_t *h) { /* release the recorded outcomes, then the histogram itself */
+  free_outs(h->outcomes) ;
+  free(h) ;
+}
+
+static void add_outcome(hist_t *h, count_t v, outcome_t o, int show) { /* record outcome o in h; v and show are forwarded to add_outcome_outs() */
+  h->outcomes = add_outcome_outs(h->outcomes,o,NOUTS,v,show) ;
+}
+
+static void merge_hists(hist_t *h0, hist_t *h1) { /* fold the contents of h1 into h0 */
+  h0->outcomes = merge_outs(h0->outcomes,h1->outcomes,NOUTS) ;
+  h0->n_neg = h0->n_neg + h1->n_neg ;
+  h0->n_pos = h0->n_pos + h1->n_pos ;
+}
+
+static count_t sum_hist(hist_t *h) { /* total returned by sum_outs() over the recorded outcomes */
+  return sum_outs(h->outcomes) ;
+}
+
+
+static void do_dump_outcome(FILE *fhist, intmax_t *o, count_t c, int show) { /* print one histogram line: count c, '*' or ':' depending on show, then the four values */
+  fprintf(fhist,"%-6"PCTR"%c>1:X0=%i; 1:X2=%i; 3:X0=%i; 3:X2=%i;\n",c,show ? '*' : ':',(int)o[out_1_x0_f],(int)o[out_1_x2_f],(int)o[out_3_x0_f],(int)o[out_3_x2_f]);
+}
+
+static void just_dump_outcomes(FILE *fhist, hist_t *h) { /* dump the outcomes of h to fhist through do_dump_outcome() */
+  outcome_t buff ; /* scratch buffer handed to dump_outs() */
+  dump_outs(fhist,do_dump_outcome,h->outcomes,buff,NOUTS) ;
+}
+
+/*******************************************************/
+/* Context allocation, freeing and reinitialization    */
+/*******************************************************/
+
+static void init(ctx_t *_a) { /* allocate the per-context arrays; _a->_p must already be set */
+  int size_of_test = _a->_p->size_of_test;
+
+  _a->seed = rand(); /* per-context seed, later passed to rand_bit() and the perm_* helpers */
+  _a->out_1_x0 = malloc_check(size_of_test*sizeof(*(_a->out_1_x0)));
+  _a->out_1_x2 = malloc_check(size_of_test*sizeof(*(_a->out_1_x2)));
+  _a->out_3_x0 = malloc_check(size_of_test*sizeof(*(_a->out_3_x0)));
+  _a->out_3_x2 = malloc_check(size_of_test*sizeof(*(_a->out_3_x2)));
+  _a->y = malloc_check(size_of_test*sizeof(*(_a->y))); /* array contents are set by reinit() */
+  _a->x = malloc_check(size_of_test*sizeof(*(_a->x)));
+  _a->fst_barrier = pb_create(N); /* N participants: one per test thread */
+  _a->barrier = malloc_check(size_of_test*sizeof(*(_a->barrier))); /* one flag per cell */
+}
+
+static void finalize(ctx_t *_a) { /* release everything init() allocated */
+  free((void *)_a->y);
+  free((void *)_a->x);
+  free((void *)_a->out_1_x0);
+  free((void *)_a->out_1_x2);
+  free((void *)_a->out_3_x0);
+  free((void *)_a->out_3_x2);
+  pb_free(_a->fst_barrier);
+  free((void *)_a->barrier); /* the cast drops the volatile qualifier */
+}
+
+static void reinit(ctx_t *_a) { /* reset every cell before a run, from the last index down to 0 */
+  int _i = _a->_p->size_of_test;
+  while (--_i >= 0) {
+    _a->y[_i] = 0;
+    _a->x[_i] = 0;
+    /* -239487 is the filler stored in the result arrays until the test threads write them */
+    _a->out_1_x0[_i] = -239487; _a->out_1_x2[_i] = -239487;
+    _a->out_3_x0[_i] = -239487; _a->out_3_x2[_i] = -239487;
+    _a->barrier[_i] = 0;
+  }
+}
+
+/**************************************/
+/* Prefetch (and check) global values */
+/**************************************/
+
+static void check_globals(ctx_t *_a) { /* called by every test thread before its loop; NOTE(review): all N threads update _a->seed here without synchronization */
+  int *y = _a->y;
+  int *x = _a->x;
+  for (int _i = _a->_p->size_of_test-1 ; _i >= 0 ; _i--) {
+    if (rand_bit(&(_a->seed)) && y[_i] != 0) fatal("ARMARM06, check_globals failed"); /* && short-circuits: the cell is read only when the random bit is 1 */
+    if (rand_bit(&(_a->seed)) && x[_i] != 0) fatal("ARMARM06, check_globals failed"); /* cells are expected to hold the 0 written by reinit() */
+  }
+  pb_wait(_a->fst_barrier); /* barrier created for N participants in init() */
+}
+
+/***************/
+/* Litmus code */
+/***************/
+
+typedef struct {
+  int th_id; /* I am running on this thread */
+  int *cpu; /* cpu array; entry th_id is the cpu handed to force_one_affinity() */
+  ctx_t *_a;   /* In this context */
+} parg_t;
+
+
+
+
+
+static void *P0(void *_vb) { /* test thread 0: for every cell, store 1 to x[_i] with stlr; _vb is a parg_t */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM06");
+  check_globals(_a); /* ends with a barrier shared by the N test threads */
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* _stride interleaved passes over the arrays */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) { /* each pass walks downwards in steps of _stride */
+      barrier_wait(_th_id,_i,&barrier[_i]); /* line the threads up on cell _i */
+      int trashed_x0; /* scratch register for the asm; its value is not used afterwards */
+asm __volatile__ (
+"\n"
+"#START _litmus_P0\n"
+"#_litmus_P0_0\n\t"
+"mov %w[x0],#1\n"
+"#_litmus_P0_1\n\t"
+"stlr %w[x0],[%[x1]]\n"
+"#END _litmus_P0\n\t"
+:[x0] "=&r" (trashed_x0)
+:[x1] "r" (&_a->x[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+static void *P1(void *_vb) { /* test thread 1: for every cell, ldar x[_i] then ldar y[_i], recording both values; _vb is a parg_t */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM06");
+  check_globals(_a); /* ends with a barrier shared by the N test threads */
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_1_x0 = _a->out_1_x0; /* receives the value loaded from x */
+  int *out_1_x2 = _a->out_1_x2; /* receives the value loaded from y */
+  for (int _j = _stride ; _j > 0 ; _j--) { /* _stride interleaved passes over the arrays */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) { /* each pass walks downwards in steps of _stride */
+      barrier_wait(_th_id,_i,&barrier[_i]); /* line the threads up on cell _i */
+asm __volatile__ (
+"\n"
+"#START _litmus_P1\n"
+"#_litmus_P1_0\n\t"
+"ldar %w[x0],[%[x1]]\n"
+"#_litmus_P1_1\n\t"
+"ldar %w[x2],[%[x3]]\n"
+"#END _litmus_P1\n\t"
+:[x2] "=&r" (out_1_x2[_i]),[x0] "=&r" (out_1_x0[_i])
+:[x1] "r" (&_a->x[_i]),[x3] "r" (&_a->y[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+static void *P2(void *_vb) { /* test thread 2: for every cell, store 1 to y[_i] with stlr; _vb is a parg_t */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM06");
+  check_globals(_a); /* ends with a barrier shared by the N test threads */
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  for (int _j = _stride ; _j > 0 ; _j--) { /* _stride interleaved passes over the arrays */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) { /* each pass walks downwards in steps of _stride */
+      barrier_wait(_th_id,_i,&barrier[_i]); /* line the threads up on cell _i */
+      int trashed_x0; /* scratch register for the asm; its value is not used afterwards */
+asm __volatile__ (
+"\n"
+"#START _litmus_P2\n"
+"#_litmus_P2_0\n\t"
+"mov %w[x0],#1\n"
+"#_litmus_P2_1\n\t"
+"stlr %w[x0],[%[x1]]\n"
+"#END _litmus_P2\n\t"
+:[x0] "=&r" (trashed_x0)
+:[x1] "r" (&_a->y[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+static void *P3(void *_vb) { /* test thread 3: for every cell, ldar y[_i] then ldar x[_i], recording both values; _vb is a parg_t */
+  mbar();
+  parg_t *_b = (parg_t *)_vb;
+  ctx_t *_a = _b->_a;
+  int _ecpu = _b->cpu[_b->th_id];
+  force_one_affinity(_ecpu,AVAIL,_a->_p->verbose,"ARMARM06");
+  check_globals(_a); /* ends with a barrier shared by the N test threads */
+  int _th_id = _b->th_id;
+  int volatile *barrier = _a->barrier;
+  int _size_of_test = _a->_p->size_of_test;
+  int _stride = _a->_p->stride;
+  int *out_3_x0 = _a->out_3_x0; /* receives the value loaded from y */
+  int *out_3_x2 = _a->out_3_x2; /* receives the value loaded from x */
+  for (int _j = _stride ; _j > 0 ; _j--) { /* _stride interleaved passes over the arrays */
+    for (int _i = _size_of_test-_j ; _i >= 0 ; _i -= _stride) { /* each pass walks downwards in steps of _stride */
+      barrier_wait(_th_id,_i,&barrier[_i]); /* line the threads up on cell _i */
+asm __volatile__ (
+"\n"
+"#START _litmus_P3\n"
+"#_litmus_P3_0\n\t"
+"ldar %w[x0],[%[x1]]\n"
+"#_litmus_P3_1\n\t"
+"ldar %w[x2],[%[x3]]\n"
+"#END _litmus_P3\n\t"
+:[x2] "=&r" (out_3_x2[_i]),[x0] "=&r" (out_3_x0[_i])
+:[x1] "r" (&_a->y[_i]),[x3] "r" (&_a->x[_i])
+:"cc","memory"
+);
+    }
+  }
+  mbar();
+  return NULL;
+}
+
+typedef struct {
+  pm_t *p_mutex; /* mutex created by run(); zyva() itself does not reference it */
+  pb_t *p_barrier; /* barrier created by run() for the n_exe test instances */
+  param_t *_p; /* parameters shared by all instances */
+  int z_id; /* instance index assigned by run() */
+  int *cpus; /* this instance's position in run()'s affinity array */
+} zyva_t;
+
+#define NT N
+
+static void *zyva(void *_va) { /* one test instance: runs the N test threads max_run times and returns a hist_t the caller must free */
+  zyva_t *_a = (zyva_t *) _va;
+  param_t *_b = _a->_p;
+  pb_wait(_a->p_barrier); /* wait for the other instances before starting */
+  pthread_t thread[NT];
+  parg_t parg[N];
+  f_t *fun[] = {&P0,&P1,&P2,&P3};
+  hist_t *hist = alloc_hist();
+  ctx_t ctx;
+  ctx._p = _b; /* init() reads the parameters through ctx._p */
+
+  init(&ctx);
+  for (int _p = N-1 ; _p >= 0 ; _p--) {
+    parg[_p].th_id = _p; parg[_p]._a = &ctx;
+    parg[_p].cpu = &(_a->cpus[0]); /* every thread gets the same array and indexes it by th_id */
+  }
+
+  for (int n_run = 0 ; n_run < _b->max_run ; n_run++) {
+    if (_b->aff_mode == aff_random) {
+      pb_wait(_a->p_barrier);
+      if (_a->z_id == 0) perm_prefix_ints(&ctx.seed,_a->cpus,_b->ncpus_used,_b->ncpus); /* only instance 0 permutes, between two barriers */
+      pb_wait(_a->p_barrier);
+    } else {
+    }
+    if (_b->verbose>1) fprintf(stderr,"Run %i of %i\r", n_run, _b->max_run);
+    reinit(&ctx);
+    if (_b->do_change) perm_funs(&ctx.seed,fun,N); /* permute which function each thread slot runs */
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      launch(&thread[_p],fun[_p],&parg[_p]);
+    }
+    if (_b->do_change) perm_threads(&ctx.seed,thread,NT); /* permute the join order */
+    for (int _p = NT-1 ; _p >= 0 ; _p--) {
+      join(&thread[_p]);
+    }
+    /* Log final states */
+    for (int _i = _b->size_of_test-1 ; _i >= 0 ; _i--) {
+      int _out_1_x0_i = ctx.out_1_x0[_i];
+      int _out_1_x2_i = ctx.out_1_x2[_i];
+      int _out_3_x0_i = ctx.out_3_x0[_i];
+      int _out_3_x2_i = ctx.out_3_x2[_i];
+      outcome_t o;
+      int cond;
+
+      cond = final_ok(final_cond(_out_1_x0_i,_out_1_x2_i,_out_3_x0_i,_out_3_x2_i));
+      o[out_1_x0_f] = _out_1_x0_i;
+      o[out_1_x2_f] = _out_1_x2_i;
+      o[out_3_x0_f] = _out_3_x0_i;
+      o[out_3_x2_f] = _out_3_x2_i;
+      add_outcome(hist,1,o,cond); /* one occurrence; cond is passed as the show flag */
+      if (cond) { hist->n_pos++; } else { hist->n_neg++; }
+    }
+  }
+
+  finalize(&ctx);
+  return hist;
+}
+
+#define ENOUGH 10
+
+static int postlude(FILE *out,cmd_t *cmd,hist_t *hist,count_t p_true,count_t p_false,tsc_t total) { /* print the report to out; returns 1 when the condition was never observed; cmd is not used */
+  fprintf(out,"Test ARMARM06 Forbidden\n");
+  fprintf(out,"Histogram (%i states)\n",finals_outs(hist->outcomes));
+  just_dump_outcomes(out,hist);
+  int cond = p_true == 0; /* the condition is "~exists": success means no matching outcome */
+  fprintf(out,"%s\n",cond?"Ok":"No");
+  fprintf(out,"\nWitnesses\n");
+  fprintf(out,"Positive: %" PCTR ", Negative: %" PCTR "\n",p_false,p_true); /* counts are printed swapped: Positive gets p_false */
+  fprintf(out,"Condition ~exists (1:X0=1 /\\ 1:X2=0 /\\ 3:X0=1 /\\ 3:X2=0) is %svalidated\n",cond ? "" : "NOT ");
+  fprintf(out,"Hash=e1d155e0765dad3f8a9955c387076a95\n");
+  fprintf(out,"Com=Rf Fr Rf Fr\n");
+  fprintf(out,"Orig=RfeLA PodRRAA FreAL RfeLA PodRRAA FreAL\n");
+  count_t cond_true = p_true;
+  count_t cond_false = p_false;
+  fprintf(out,"Observation ARMARM06 %s %" PCTR " %" PCTR "\n",!cond_true ? "Never" : !cond_false ? "Always" : "Sometimes",cond_true,cond_false);
+  if (p_true > 0) { /* intentionally empty in this generated file */
+  }
+  fprintf(out,"Time ARMARM06 %.2f\n",total / 1000000.0);
+  fflush(out);
+  return cond;
+}
+
+static int run(cmd_t *cmd,cpus_t *def_all_cpus,FILE *out) { /* run n_exe concurrent test instances, merge their histograms and print the report; returns postlude()'s result; def_all_cpus is not used */
+  tsc_t start = timeofday();
+  param_t prm ;
+/* Set some parameters */
+  prm.verbose = cmd->verbose;
+  prm.size_of_test = cmd->size_of_test;
+  prm.max_run = cmd->max_run;
+  prm.stride = cmd->stride;
+  prm.do_change = 1;
+  if (cmd->fix) prm.do_change = 0;
+/* Computes number of test concurrent instances */
+  int n_avail = cmd->avail > 0 ? cmd->avail : cmd->aff_cpus->sz;
+  if (n_avail >  cmd->aff_cpus->sz) log_error("Warning: avail=%i, available=%i\n",n_avail, cmd->aff_cpus->sz);
+  int n_exe;
+  if (cmd->n_exe > 0) {
+    n_exe = cmd->n_exe;
+  } else {
+    n_exe = n_avail < N ? 1 : n_avail / N; /* one instance per group of N cpus, at least one */
+  }
+/* Set affinity parameters */
+  cpus_t *all_cpus = cmd->aff_cpus;
+  int aff_cpus_sz = cmd->aff_mode == aff_random ? max(all_cpus->sz,N*n_exe) : N*n_exe;
+  int aff_cpus[aff_cpus_sz];
+  prm.aff_mode = cmd->aff_mode;
+  prm.ncpus = aff_cpus_sz;
+  prm.ncpus_used = N*n_exe;
+/* Show parameters to user */
+  if (prm.verbose) {
+    log_error( "ARMARM06: n=%i, r=%i, s=%i",n_exe,prm.max_run,prm.size_of_test);
+    log_error(", st=%i",prm.stride);
+    if (cmd->aff_mode == aff_incr) {
+      log_error( ", i=%i",cmd->aff_incr);
+    } else if (cmd->aff_mode == aff_random) {
+      log_error(", +ra");
+    } else if (cmd->aff_mode == aff_custom) {
+      log_error(", +ca");
+    } else if (cmd->aff_mode == aff_scan) {
+      log_error(", +sa");
+    }
+    log_error(", p='");
+    cpus_dump(stderr,cmd->aff_cpus);
+    log_error("'");
+    log_error("\n");
+  }
+  if (cmd->aff_mode == aff_random) {
+    for (int k = 0 ; k < aff_cpus_sz ; k++) {
+      aff_cpus[k] = all_cpus->cpu[k % all_cpus->sz]; /* fill the array by cycling through the available cpus */
+    }
+  }
+  hist_t *hist = NULL;
+  int n_th = n_exe-1; /* the last instance runs on the calling thread */
+  pthread_t th[n_th]; /* NOTE(review): zero-length array when n_exe == 1 */
+  zyva_t zarg[n_exe];
+  pm_t *p_mutex = pm_create();
+  pb_t *p_barrier = pb_create(n_exe);
+  int next_cpu = 0;
+  int delta = cmd->aff_incr;
+  if (delta <= 0) {
+    for (int k=0 ; k < all_cpus->sz ; k++) all_cpus->cpu[k] = -1; /* negative cpu numbers make force_one_affinity() a no-op */
+    delta = 1;
+  } else {
+    delta %= all_cpus->sz;
+  }
+  int start_scan=0, max_start=gcd(delta,all_cpus->sz);
+  int *aff_p = aff_cpus;
+  for (int k=0 ; k < n_exe ; k++) {
+    zyva_t *p = &zarg[k];
+    p->_p = &prm;
+    p->p_mutex = p_mutex; p->p_barrier = p_barrier; /* shared by all instances */
+    p->z_id = k;
+    p->cpus = aff_p;
+    if (cmd->aff_mode != aff_incr) {
+      aff_p += N;
+    } else {
+      for (int i=0 ; i < N ; i++) { /* pick N cpus for this instance, stepping by delta */
+        *aff_p = all_cpus->cpu[next_cpu]; aff_p++;
+        next_cpu += delta; next_cpu %= all_cpus->sz;
+        if (next_cpu == start_scan) { /* the walk came back to its start: restart from the next offset */
+          start_scan++ ; start_scan %= max_start;
+          next_cpu = start_scan;
+        }
+      }
+    }
+    if (k < n_th) {
+      launch(&th[k],zyva,p);
+    } else {
+      hist = (hist_t *)zyva(p);
+    }
+  }
+
+  count_t n_outs = prm.size_of_test; n_outs *= prm.max_run; /* outcomes expected from one instance */
+  for (int k=0 ; k < n_th ; k++) {
+    hist_t *hk = (hist_t *)join(&th[k]);
+    if (sum_hist(hk) != n_outs || hk->n_pos + hk->n_neg != n_outs) {
+      fatal("ARMARM06, sum_hist");
+    }
+    merge_hists(hist,hk);
+    free_hist(hk);
+  }
+  cpus_free(all_cpus); /* this releases cmd->aff_cpus */
+  tsc_t total = timeofday() - start;
+  pm_free(p_mutex);
+  pb_free(p_barrier);
+
+  n_outs *= n_exe ; /* outcomes expected from all instances together */
+  if (sum_hist(hist) != n_outs || hist->n_pos + hist->n_neg != n_outs) {
+    fatal("ARMARM06, sum_hist") ;
+  }
+  count_t p_true = hist->n_pos, p_false = hist->n_neg;
+  int cond = postlude(out,cmd,hist,p_true,p_false,total);
+  free_hist(hist);
+  return cond;
+}
+
+
+int main(int argc, char **argv) { /* entry point: the exit status reports the result of run() */
+  cpus_t *def_all_cpus = read_force_affinity(AVAIL,0);
+  if (def_all_cpus->sz < N) { /* fewer than N cpus reported: exit successfully without running the test */
+    cpus_free(def_all_cpus);
+    return EXIT_SUCCESS;
+  }
+  cmd_t def = { 0, NUMBER_OF_RUN, SIZE_OF_TEST, STRIDE, AVAIL, 0, 0, aff_incr, 0, 0, AFF_INCR, def_all_cpus, NULL, -1, MAX_LOOP, NULL, NULL, -1, -1, -1, 0, 0};
+  cmd_t cmd = def; /* start from the defaults; parse_cmd() receives both copies */
+  parse_cmd(argc,argv,&def,&cmd);
+  int cond = run(&cmd,def_all_cpus,stdout); /* run() frees cmd.aff_cpus before returning */
+  if (def_all_cpus != cmd.aff_cpus) cpus_free(def_all_cpus); /* free the default set only when cmd.aff_cpus points elsewhere */
+  return cond ? EXIT_SUCCESS : EXIT_FAILURE; /* non-zero cond maps to success */
+}
diff --git a/tests/tcg/mttcg/aarch64/Makefile b/tests/tcg/mttcg/aarch64/Makefile
new file mode 100644
index 0000000..53e75f6
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/Makefile
@@ -0,0 +1,52 @@ 
+GCC=aarch64-linux-gnu-gcc
+GCCOPTS=-D_GNU_SOURCE -DFORCE_AFFINITY -Wall -std=gnu99 -O2 -pthread -static
+LINKOPTS=
+SRC=\
+ ARMARM00.c\
+ ARMARM01.c\
+ ARMARM02.c\
+ ARMARM03.c\
+ ARMARM04+BIS.c\
+ ARMARM04.c\
+ ARMARM04+TER.c\
+ ARMARM05.c\
+ ARMARM06+AP+AA.c\
+ ARMARM06+AP+AP.c\
+ ARMARM06.c\
+
+EXE=$(SRC:.c=.exe)
+T=$(SRC:.c=.t)
+.PHONY: all clean cleansource tests # these name actions, not files
+all: $(EXE) $(T)
+
+clean:
+	/bin/rm -f *.o *.s *.t *.exe *~
+
+cleansource:
+	/bin/rm -f *.o *.c *.h *.s *~
+# Support objects linked into every test executable
+affinity.o: affinity.c
+	$(GCC) $(GCCOPTS) -O2 -c affinity.c
+
+outs.o: outs.c
+	$(GCC) $(GCCOPTS) -O2 -c outs.c
+
+utils.o: utils.c
+	$(GCC) $(GCCOPTS) -O2 -c utils.c
+
+litmus_rand.o: litmus_rand.c
+	$(GCC) $(GCCOPTS) -O2 -c litmus_rand.c
+
+UTILS=affinity.o outs.o utils.o litmus_rand.o
+# Each test is compiled to assembly first; the .exe and .t files are both made from the .s
+%.exe:%.s $(UTILS)
+	$(GCC) $(GCCOPTS) $(LINKOPTS) -o $@ $(UTILS) $<
+
+%.s:%.c
+	$(GCC) $(GCCOPTS) -S $<
+
+%.t:%.s
+	awk -f show.awk $< > $@
+
+tests: all
+	./run.sh
diff --git a/tests/tcg/mttcg/aarch64/README.txt b/tests/tcg/mttcg/aarch64/README.txt
new file mode 100644
index 0000000..c160b8d
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/README.txt
@@ -0,0 +1,22 @@ 
+Tests produced by litmus for the AArch64 architecture on Linux
+
+COMPILING
+  with command 'make [-j N]' or 'sh comp.sh'
+
+RUNNING ALL TESTS
+  with command 'sh run.sh'. Test result on standard output.
+
+RUNNING ONE TEST
+  Tests are .exe files, for instance ARMARM06.exe, run it by './ARMARM06.exe'
+
+RUNNING OPTIONS
+  Main options to the run.sh script and to .exe files:
+  -v     be verbose (can be repeated).
+  -a <n> number of (logical) processors available, default 0.
+      The default value of 0 means that .exe files attempt
+      to infer the actual number of logical threads.
+  -s <n> one run operates on arrays of size <n>, default 100000.
+  -r <n> number of runs, default 10.
+
+  For more options see, for instance, './ARMARM06.exe -help' and the litmus documentation
+  <http://diy.inria.fr/doc/litmus.html>
diff --git a/tests/tcg/mttcg/aarch64/affinity.c b/tests/tcg/mttcg/aarch64/affinity.c
new file mode 100644
index 0000000..9535bf2
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/affinity.c
@@ -0,0 +1,159 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* Copyright 2015-present Institut National de Recherche en Informatique et */
+/* en Automatique and the authors. All rights reserved.                     */
+/*                                                                          */
+/* This software is governed by the CeCILL-B license under French law and   */
+/* abiding by the rules of distribution of free software. You can use,      */
+/* modify and/ or redistribute the software under the terms of the CeCILL-B */
+/* license as circulated by CEA, CNRS and INRIA at the following URL        */
+/* "http://www.cecill.info". We also give a copy in LICENSE.txt.            */
+/****************************************************************************/
+#include <stdio.h>
+#include <sched.h>
+#include <unistd.h>
+#include "utils.h"
+#include "affinity.h"
+
+#ifdef CPUS_DEFINED
+cpus_t *read_affinity(void) { /* return a new cpus_t listing the cpus set in the calling thread's affinity mask */
+  cpu_set_t mask;
+  int sz = 0 ;
+  int res = pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask) ;
+  
+  if (res != 0) { /* errexit() receives the error code returned by the call */
+    errexit("pthread_getaffinity_np",res);
+  }
+  for (int p=0 ; p <  CPU_SETSIZE ; p++) { /* first pass: count the cpus in the mask */
+    if (CPU_ISSET(p,&mask)) sz++ ;
+  }
+
+  cpus_t *r = cpus_create(sz) ;
+  for (int p=0, *q=r->cpu ; p <  CPU_SETSIZE ; p++) { /* second pass: store their numbers in increasing order */
+    if (CPU_ISSET(p,&mask)) *q++ = p ;
+  }
+  return r ;
+}
+
+#endif
+/* Attempt to force processors wake up, on devices where unused procs
+   go to sleep... */
+
+
+#ifdef FORCE_AFFINITY
+static const tsc_t sec = (tsc_t)1000000 ; /* used as sec/100 for the warm-up duration */
+
+/* Thread body: spin until timeofday() reaches the deadline stored at *p. */
+static void* loop(void *p)  {
+  tsc_t deadline = *(tsc_t *)p ;
+  while (timeofday() < deadline) { }
+  return NULL ;
+}
+
+
+static void warm_up(int sz, tsc_t d) { /* start sz threads that each spin in loop() until d has elapsed, then join them */
+    pthread_t th[sz];
+    d += timeofday() ; /* turn the duration into an absolute deadline */
+    for (int k = 0 ; k < sz ; k++) launch(&th[k], loop, &d) ; /* all threads read the same deadline */
+    for (int k = 0 ; k < sz ; k++) join(&th[k]) ;
+}
+
+#ifdef CPUS_DEFINED
+cpus_t *read_force_affinity(int n_avail, int verbose) { /* warm up and re-read the affinity mask until it holds at least n_avail cpus */
+  int sz = n_avail <= 1 ? 1 : n_avail ;
+  tsc_t max = sec / 100 ;
+
+  for ( ; ; ) { /* no retry limit: loops until enough cpus are reported */
+    warm_up(sz+1,max) ;
+    cpus_t *r = read_affinity() ;
+    if (n_avail <= r->sz) return r ; /* the returned set is not freed here */
+    if (verbose) {
+      fprintf(stderr,"Read affinity: '") ;
+      cpus_dump(stderr,r) ;
+      fprintf(stderr,"'\n") ;
+    }
+    cpus_free(r) ; /* too few cpus: discard this reading and try again */
+  }
+}
+#endif
+#endif
+
+#ifdef CPUS_DEFINED
+
+/* Enforcing processor affinity.
+   Notice that logical processor numbers may be negative.
+   In that case, affinity setting is ignored */
+ 
+
+void write_affinity(cpus_t *p) { /* restrict the calling thread to the non-negative cpus listed in p */
+  cpu_set_t mask;
+  int exists_pos = 0 ; /* becomes 1 once a non-negative cpu number is seen */
+
+  CPU_ZERO(&mask) ;
+  for (int k = 0 ; k < p->sz ; k++) {
+    if (p->cpu[k] >= 0) { /* negative entries are skipped */
+      CPU_SET(p->cpu[k],&mask) ;
+      exists_pos = 1 ;
+    }
+  }
+  if  (exists_pos) { /* with no usable cpu the affinity is left unchanged */
+    int r = pthread_setaffinity_np(pthread_self(),sizeof(mask),&mask) ;
+    if (r != 0) {
+      errexit("pthread_setaffinity_np",r) ;
+    }
+  }
+}
+#endif
+
+/* Pin the calling thread to cpu a; a negative a leaves the affinity unchanged. */
+void write_one_affinity(int a) {
+  if (a < 0) return ;
+  cpu_set_t mask;
+  CPU_ZERO(&mask) ;
+  CPU_SET(a,&mask) ;
+  int r = pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) ;
+  if (r != 0) {
+    errexit("pthread_setaffinity_np",r) ;
+  }
+}
+
+#ifdef FORCE_AFFINITY
+/* Get the number of present cpus from sysfs, or -1 when it cannot be determined.
+   Understands a range "A-B" (B-A+1 cpus) and a lone number "A" (one cpu). */
+static const char *present = "/sys/devices/system/cpu/present" ;
+
+static int get_present(void) {
+  FILE *fp = fopen(present,"r") ;
+  if (fp == NULL) return -1 ;
+  int r1, r2, n = fscanf(fp,"%d-%d\n",&r1,&r2) ;
+  int c = (n == 1) ? fgetc(fp) : EOF ; /* character that stopped the scan after a lone number */
+  fclose(fp) ;
+  if (n != 2) return (n == 1 && (c == '\n' || c == EOF)) ? 1 : -1 ; /* "A" alone is one cpu; lists like "0,2-3" stay unsupported */
+  return r2-r1+1 ;
+}
+
+void force_one_affinity(int a, int sz,int verbose, char *name) { /* pin the calling thread to cpu a, warming up and retrying until it succeeds; does nothing when a < 0 */
+  if (a >= 0) {
+    cpu_set_t mask;
+    int r ;
+    CPU_ZERO(&mask) ;
+    CPU_SET(a,&mask) ;
+    do { /* no retry limit: repeats until the call succeeds */
+      r = pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) ;
+      if (r != 0) {
+        if (verbose)
+          fprintf(stderr,"%s: force %i failed\n",name,a) ;
+        int nwarm = get_present() ;
+        if (verbose > 1)
+          fprintf(stderr,"%s: present=%i\n",name,nwarm) ;
+        if (nwarm < 0) nwarm = sz+1 ; /* present count unknown: fall back to sz+1 warm-up threads */
+        warm_up(nwarm,sec/100) ;
+      }
+    } while (r != 0) ;
+  }
+}
+#endif
diff --git a/tests/tcg/mttcg/aarch64/affinity.h b/tests/tcg/mttcg/aarch64/affinity.h
new file mode 100644
index 0000000..9fb6a25
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/affinity.h
@@ -0,0 +1,34 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* Copyright 2015-present Institut National de Recherche en Informatique et */
+/* en Automatique and the authors. All rights reserved.                     */
+/*                                                                          */
+/* This software is governed by the CeCILL-B license under French law and   */
+/* abiding by the rules of distribution of free software. You can use,      */
+/* modify and/ or redistribute the software under the terms of the CeCILL-B */
+/* license as circulated by CEA, CNRS and INRIA at the following URL        */
+/* "http://www.cecill.info". We also give a copy in LICENSE.txt.            */
+/****************************************************************************/
+#ifndef _AFFINITY_H
+#define _AFFINITY_H 1
+
+#include "utils.h"
+
+#ifdef CPUS_DEFINED
+cpus_t *read_affinity(void) ;
+#ifdef FORCE_AFFINITY
+cpus_t *read_force_affinity(int n_avail, int verbose) ;
+#endif
+void write_affinity(cpus_t *p) ;
+#endif
+
+void write_one_affinity(int cpu) ;
+#ifdef FORCE_AFFINITY
+void force_one_affinity(int cpu, int sz, int verbose, char *name) ;
+#endif
+
+#endif
diff --git a/tests/tcg/mttcg/aarch64/comp.sh b/tests/tcg/mttcg/aarch64/comp.sh
new file mode 100644
index 0000000..62d65ba
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/comp.sh
@@ -0,0 +1,30 @@ 
+GCC=${GCC:-gcc} # override to cross-compile, e.g. GCC=aarch64-linux-gnu-gcc as in the Makefile
+GCCOPTS="-D_GNU_SOURCE -DFORCE_AFFINITY -Wall -std=gnu99 -O2 -pthread"
+LINKOPTS=${LINKOPTS:-} # extra flags for the link commands only, e.g. LINKOPTS=-static
+/bin/rm -f *.exe *.s
+$GCC $GCCOPTS -O2 -c affinity.c
+$GCC $GCCOPTS -O2 -c outs.c
+$GCC $GCCOPTS -O2 -c utils.c
+$GCC $GCCOPTS -O2 -c litmus_rand.c
+$GCC $GCCOPTS $LINKOPTS -o ARMARM00.exe affinity.o outs.o utils.o litmus_rand.o ARMARM00.c
+$GCC $GCCOPTS -S ARMARM00.c && awk -f show.awk ARMARM00.s > ARMARM00.t && /bin/rm ARMARM00.s
+$GCC $GCCOPTS $LINKOPTS -o ARMARM01.exe affinity.o outs.o utils.o litmus_rand.o ARMARM01.c
+$GCC $GCCOPTS -S ARMARM01.c && awk -f show.awk ARMARM01.s > ARMARM01.t && /bin/rm ARMARM01.s
+$GCC $GCCOPTS $LINKOPTS -o ARMARM02.exe affinity.o outs.o utils.o litmus_rand.o ARMARM02.c
+$GCC $GCCOPTS -S ARMARM02.c && awk -f show.awk ARMARM02.s > ARMARM02.t && /bin/rm ARMARM02.s
+$GCC $GCCOPTS $LINKOPTS -o ARMARM03.exe affinity.o outs.o utils.o litmus_rand.o ARMARM03.c
+$GCC $GCCOPTS -S ARMARM03.c && awk -f show.awk ARMARM03.s > ARMARM03.t && /bin/rm ARMARM03.s
+$GCC $GCCOPTS $LINKOPTS -o ARMARM04+BIS.exe affinity.o outs.o utils.o litmus_rand.o ARMARM04+BIS.c
+$GCC $GCCOPTS -S ARMARM04+BIS.c && awk -f show.awk ARMARM04+BIS.s > ARMARM04+BIS.t && /bin/rm ARMARM04+BIS.s
+$GCC $GCCOPTS $LINKOPTS -o ARMARM04.exe affinity.o outs.o utils.o litmus_rand.o ARMARM04.c
+$GCC $GCCOPTS -S ARMARM04.c && awk -f show.awk ARMARM04.s > ARMARM04.t && /bin/rm ARMARM04.s
+$GCC $GCCOPTS $LINKOPTS -o ARMARM04+TER.exe affinity.o outs.o utils.o litmus_rand.o ARMARM04+TER.c
+$GCC $GCCOPTS -S ARMARM04+TER.c && awk -f show.awk ARMARM04+TER.s > ARMARM04+TER.t && /bin/rm ARMARM04+TER.s
+$GCC $GCCOPTS $LINKOPTS -o ARMARM05.exe affinity.o outs.o utils.o litmus_rand.o ARMARM05.c
+$GCC $GCCOPTS -S ARMARM05.c && awk -f show.awk ARMARM05.s > ARMARM05.t && /bin/rm ARMARM05.s
+$GCC $GCCOPTS $LINKOPTS -o ARMARM06+AP+AA.exe affinity.o outs.o utils.o litmus_rand.o ARMARM06+AP+AA.c
+$GCC $GCCOPTS -S ARMARM06+AP+AA.c && awk -f show.awk ARMARM06+AP+AA.s > ARMARM06+AP+AA.t && /bin/rm ARMARM06+AP+AA.s
+$GCC $GCCOPTS $LINKOPTS -o ARMARM06+AP+AP.exe affinity.o outs.o utils.o litmus_rand.o ARMARM06+AP+AP.c
+$GCC $GCCOPTS -S ARMARM06+AP+AP.c && awk -f show.awk ARMARM06+AP+AP.s > ARMARM06+AP+AP.t && /bin/rm ARMARM06+AP+AP.s
+$GCC $GCCOPTS $LINKOPTS -o ARMARM06.exe affinity.o outs.o utils.o litmus_rand.o ARMARM06.c
+$GCC $GCCOPTS -S ARMARM06.c && awk -f show.awk ARMARM06.s > ARMARM06.t && /bin/rm ARMARM06.s
diff --git a/tests/tcg/mttcg/aarch64/litmus_rand.c b/tests/tcg/mttcg/aarch64/litmus_rand.c
new file mode 100644
index 0000000..de33032
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/litmus_rand.c
@@ -0,0 +1,64 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* Copyright 2015-present Institut National de Recherche en Informatique et */
+/* en Automatique and the authors. All rights reserved.                     */
+/*                                                                          */
+/* This software is governed by the CeCILL-B license under French law and   */
+/* abiding by the rules of distribution of free software. You can use,      */
+/* modify and/ or redistribute the software under the terms of the CeCILL-B */
+/* license as circulated by CEA, CNRS and INRIA at the following URL        */
+/* "http://www.cecill.info". We also give a copy in LICENSE.txt.            */
+/****************************************************************************/
+#include <stdint.h>
+#include "litmus_rand.h"
+
+/*
+  Simple generator
+  http://en.wikipedia.org/wiki/Linear_congruential_generator
+*/
+
+
+/*
+
+  From ocaml sources: (globroot.c)
+  Linear congruence with modulus = 2^32, multiplier = 69069
+  (Knuth vol 2 p. 106, line 15 of table 1), additive = 25173.
+
+
+  Knuth (vol 2 p. 13) shows that the least significant bits are
+  "less random" than the most significant bits with a modulus of 2^m.
+  We just swap half words, enough? */
+
+static const uint32_t a = 69069;
+static const uint32_t c = 25173 ;
+
+/* One step of the linear congruential generator: stores the new state in *st
+   and returns it with its two 16-bit halves exchanged. */
+inline static uint32_t unlocked_rand(st_t *st)  {
+  uint32_t state = a * *st + c ;
+  uint32_t hi16 = state >> 16 ;
+  uint32_t lo16 = state & 0xffff ;
+  *st = state ;
+  return (lo16 << 16) | hi16 ;
+}
+
+/* Advance the generator state *st and return one pseudorandom bit (0 or 1). */
+int rand_bit(st_t *st)  {
+  uint32_t bit = unlocked_rand(st) & 1 ;
+  return bit ;
+}
+
+static const uint32_t r_max = UINT32_MAX ;
+
+uint32_t rand_k (uint32_t *st,uint32_t k) { /* pseudorandom value in [0,k), advancing *st */
+  for ( ; ; ) {
+    uint32_t r = unlocked_rand(st) ;
+    uint32_t v = r % k ;
+    /* draw again when r-v exceeds r_max-k+1, otherwise keep the remainder */
+    if (r-v <= r_max-k+1) return v ;
+  }
+}
diff --git a/tests/tcg/mttcg/aarch64/litmus_rand.h b/tests/tcg/mttcg/aarch64/litmus_rand.h
new file mode 100644
index 0000000..c358ccb
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/litmus_rand.h
@@ -0,0 +1,29 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* Copyright 2015-present Institut National de Recherche en Informatique et */
+/* en Automatique and the authors. All rights reserved.                     */
+/*                                                                          */
+/* This software is governed by the CeCILL-B license under French law and   */
+/* abiding by the rules of distribution of free software. You can use,      */
+/* modify and/ or redistribute the software under the terms of the CeCILL-B */
+/* license as circulated by CEA, CNRS and INRIA at the following URL        */
+/* "http://www.cecill.info". We also give a copy in LICENSE.txt.            */
+/****************************************************************************/
+#ifndef _LITMUS_RAND_H
+#define _LITMUS_RAND_H 1
+
+#include <stdint.h>
+
+/* type of state for pseudorandom  generators */
+typedef uint32_t st_t ;
+
+/* Unlocked random bit */
+
+int rand_bit(st_t *st) ;
+uint32_t rand_k(st_t *st,uint32_t n) ;
+
+#endif
diff --git a/tests/tcg/mttcg/aarch64/outs.c b/tests/tcg/mttcg/aarch64/outs.c
new file mode 100644
index 0000000..178f1d2
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/outs.c
@@ -0,0 +1,148 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* Copyright 2015-present Institut National de Recherche en Informatique et */
+/* en Automatique and the authors. All rights reserved.                     */
+/*                                                                          */
+/* This software is governed by the CeCILL-B license under French law and   */
+/* abiding by the rules of distribution of free software. You can use,      */
+/* modify and/ or redistribute the software under the terms of the CeCILL-B */
+/* license as circulated by CEA, CNRS and INRIA at the following URL        */
+/* "http://www.cecill.info". We also give a copy in LICENSE.txt.            */
+/****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include "outs.h"
+
+/**********************/
+/* Lexicographic tree */
+/**********************/
+
+#if 0
+static void debug(int *t, int i, int j) {
+  for (int k=i ; k <= j ; k++)
+    fprintf(stderr,"%i",t[k]) ;
+  fprintf(stderr,"\n") ;
+}
+#endif
+
+
+void *malloc_check(size_t sz) ;
+
+static outs_t *alloc_outs(intmax_t k) { /* new tree cell with key k, zero count, show flag clear, no sibling and no child */
+  outs_t *r = malloc_check(sizeof(*r)) ;
+  r->k = k ;
+  r->c = 0 ;
+  r->show = 0 ;
+  r->next = r->down = NULL ;
+  return r ;
+}
+
+void free_outs(outs_t *p) { /* recursively free the tree rooted at p (siblings through next, children through down); NULL is accepted */
+  if (p == NULL) return ;
+  free_outs(p->next) ;
+  free_outs(p->down) ;
+  free(p) ;
+}
+
+/* Add count c to outcome k[0..i], creating cells as needed. Worth writing as a loop, since called many times */
+static outs_t *loop_add_outcome_outs(outs_t *p, intmax_t *k, int i, count_t c, int show) { /* returns the head of the top-level list, which changes when a new smallest key is inserted */
+  outs_t *r = p ;
+  if (p == NULL || k[i] < p->k) {
+    r = alloc_outs(k[i]) ;
+    r->next = p ;
+    p = r ;
+  }
+  for ( ; ; ) {
+    outs_t **q ;
+    if (k[i] > p->k) { /* key lies further along the sibling list, which is kept in increasing key order */
+      q = &(p->next) ;
+      p = p->next ;
+    } else if (i <= 0) { /* key matches and this was the last component: account for the outcome */
+      p->c += c ;
+      p->show = show || p->show ;
+      return r ;
+    } else { /* key matches: continue with component i-1 one level down */
+      i-- ;
+      q = &(p->down) ;
+      p = p->down ;
+    }
+    if (p == NULL || k[i] < p->k) { /* no cell for this key yet: splice a new one in before p through the link *q */
+      outs_t *a = alloc_outs(k[i]) ;
+      a->next = p ;
+      p = a ;
+      *q = a ;
+    }
+  }
+}
+
+outs_t *add_outcome_outs(outs_t *p, intmax_t *k, int sz, count_t c, int show) { /* record outcome k[0..sz-1] with count c in tree p; returns the updated root */
+  return loop_add_outcome_outs(p,k,sz-1,c,show) ;
+}
+
+count_t sum_outs(outs_t *p) { /* sum of the counts c of every cell in the tree rooted at p (0 for NULL) */
+  count_t r = 0 ;
+  for ( ; p ; p = p->next) {
+    r += p->c ;
+    r += sum_outs(p->down) ;
+  }
+  return r ;
+}
+
+int finals_outs(outs_t *p) { /* number of cells in the tree rooted at p whose count c is positive */
+  int r = 0 ;
+  for ( ; p ; p = p->next) {
+    if (p->c > 0) r++ ;
+    r += finals_outs(p->down) ;
+  }
+  return r ;
+}
+
+void dump_outs (FILE *chan, dump_outcome *dout,outs_t *p, intmax_t *buff,int sz) { /* walk the tree, rebuilding each outcome in buff and passing it to dout with its count and show flag */
+  for ( ; p ; p = p->next) {
+    buff[sz-1] = p->k ; /* this level supplies component sz-1 of the outcome */
+    if (p->c > 0) {
+      dout(chan,buff,p->c,p->show) ;
+    } else if (p->down) { /* only cells with a zero count are descended into */
+      dump_outs(chan,dout,p->down,buff,sz-1) ;
+    }
+  }
+}
+
+/* merge p and q into p: q is only read, its counts are added into (possibly new) cells of p; returns the head of the merged list */
+static outs_t *do_merge_outs(outs_t *p, outs_t *q) {
+  if (q == NULL) { // Nothing to add
+    return p ;
+  }
+  if (p == NULL || q->k < p->k) { // Need a cell
+    outs_t *r = alloc_outs(q->k) ;
+    r->next = p ;
+    p = r ;
+  }
+  if (p->k == q->k) { // Same key: add counts, then merge children and the remaining siblings
+    p->c += q->c ;
+    p->show = p->show || q->show ;
+    p->down = do_merge_outs(p->down,q->down) ;
+    p->next = do_merge_outs(p->next,q->next) ;
+  } else { // Here p->k < q->k: q belongs further along p's sibling list
+    p->next = do_merge_outs(p->next,q) ;
+  }
+  return p ;
+}
+
+outs_t *merge_outs(outs_t *p, outs_t *q, int sz) { /* public wrapper around do_merge_outs(); sz is not used */
+  return do_merge_outs(p,q) ;
+}
+
+int same_outs(outs_t *p,outs_t *q) { /* 1 when both trees have the same shape with equal keys, counts and show flags in every cell, 0 otherwise */
+  while (p && q) {
+    if (p->k != q->k || p->c != q->c || p->show != q->show) return 0 ;
+    if (!same_outs(p->down,q->down)) return 0 ;
+    p = p->next ;
+    q = q->next ;
+  }
+  return p == q ; /* == NULL */
+}
diff --git a/tests/tcg/mttcg/aarch64/outs.h b/tests/tcg/mttcg/aarch64/outs.h
new file mode 100644
index 0000000..761590f
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/outs.h
@@ -0,0 +1,49 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* Copyright 2015-present Institut National de Recherche en Informatique et */
+/* en Automatique and the authors. All rights reserved.                     */
+/*                                                                          */
+/* This software is governed by the CeCILL-B license under French law and   */
+/* abiding by the rules of distribution of free software. You can use,      */
+/* modify and/ or redistribute the software under the terms of the CeCILL-B */
+/* license as circulated by CEA, CNRS and INRIA at the following URL        */
+/* "http://www.cecill.info". We also give a copy in LICENSE.txt.            */
+/****************************************************************************/
+#ifndef _OUTS_H
+#define _OUTS_H 1
+
+#include <stdio.h>
+
+/************************/
+/* Histogram structure  */
+/************************/
+
+
+/* 64bit counters, should be enough! */
+#include <inttypes.h>
+typedef uint64_t count_t;
+#define PCTR PRIu64
+
+
+
+
+typedef struct outs_t {
+  struct outs_t *next,*down ; /* next: sibling cell at the same level; down: cells for the preceding outcome component */
+  count_t c ; /* accumulated count of the outcome that ends at this cell */
+  intmax_t k ; /* key: value of one outcome component */
+  int show ; /* flag OR-ed in by add_outcome_outs()/merge_outs() and handed to the dump callback */
+} outs_t ;
+
+void free_outs(outs_t *p) ;
+outs_t *add_outcome_outs(outs_t *p, intmax_t *o, int sz, count_t v, int show) ;
+int finals_outs(outs_t *p) ;
+count_t sum_outs(outs_t *p) ;
+typedef void dump_outcome(FILE *chan, intmax_t *o, count_t c, int show) ;
+void dump_outs (FILE *chan, dump_outcome *dout,outs_t *p, intmax_t *buff, int sz) ;
+outs_t *merge_outs(outs_t *p,outs_t *q, int sz) ;
+int same_outs(outs_t *p,outs_t *q) ;
+#endif
diff --git a/tests/tcg/mttcg/aarch64/run.sh b/tests/tcg/mttcg/aarch64/run.sh
new file mode 100755
index 0000000..214709e
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/run.sh
@@ -0,0 +1,351 @@ 
+date
+LITMUSOPTS="${@:-$LITMUSOPTS}"
+QEMU=../../../../aarch64-linux-user/qemu-aarch64
+SLEEP=0
+if [ ! -f ARMARM00.no ]; then
+cat <<'EOF'
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Results for ../alex_litmus/aarch64.tests/HAND/ARMARM00.litmus %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+AArch64 ARMARM00
+"PodWWPL RfeLA PodRRAP Fre"
+
+{0:X1=x; 0:X3=y; 1:X1=y; 1:X3=x; 1:X2=-1;}
+
+ P0           | P1           ;
+ MOV W0,#1    | LDAR W0,[X1] ;
+ STR W0,[X1]  | CMP W0,#1    ;
+ MOV W2,#1    | B.NE Exit1   ;
+ STLR W2,[X3] | LDR W2,[X3]  ;
+              | Exit1:       ;
+
+~exists (1:X0=1 /\ 1:X2=0)
+Generated assembler
+EOF
+cat ARMARM00.t
+$QEMU ./ARMARM00.exe -q $LITMUSOPTS
+ret=$?
+if [ $ret -eq 1 ]; then
+    echo "FAILED";
+    exit 1;
+fi
+fi
+sleep $SLEEP
+
+if [ ! -f ARMARM01.no ]; then
+cat <<'EOF'
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Results for ../alex_litmus/aarch64.tests/HAND/ARMARM01.litmus %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+AArch64 ARMARM01
+"PodWWPL RfeLP DpAddrdR Fre"
+
+{0:X1=x; 0:X3=y; 1:X1=y; 1:X4=x; 1:X3=-1;}
+
+ P0           | P1                  ;
+ MOV W0,#1    | LDR W0,[X1]         ;
+ STR W0,[X1]  | CMP W0,#1           ;
+ MOV W2,#1    | B.NE Exit1          ;
+ STLR W2,[X3] | EOR W2,W0,W0        ;
+              | LDR W3,[X4,W2,SXTW] ;
+              | Exit1:              ;
+
+~exists (1:X0=1 /\ 1:X3=0)
+Generated assembler
+EOF
+cat ARMARM01.t
+$QEMU ./ARMARM01.exe -q $LITMUSOPTS
+ret=$?
+if [ $ret -eq 1 ]; then
+    echo "FAILED";
+    exit 1;
+fi
+fi
+sleep $SLEEP
+
+if [ ! -f ARMARM02.no ]; then
+cat <<'EOF'
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Results for ../alex_litmus/aarch64.tests/HAND/ARMARM02.litmus %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+AArch64 ARMARM02
+
+{0:X1=x; 0:X3=y; 1:X1=y; 1:X4=x; 1:X3=-1; 2:X1=y; 2:X4=x; 2:X3=-1;}
+
+ P0           | P1                  | P2                  ;
+ MOV W0,#1    | LDR W0,[X1]         | LDR W0,[X1]         ;
+ STR W0,[X1]  | CMP W0,#1           | CMP W0,#1           ;
+ MOV W2,#1    | B.NE Exit1          | B.NE Exit2          ;
+ STLR W2,[X3] | EOR W2,W0,W0        | EOR W2,W0,W0        ;
+              | LDR W3,[X4,W2,SXTW] | LDR W3,[X4,W2,SXTW] ;
+              | Exit1:              | Exit2:              ;
+
+~exists (1:X0=1 /\ 1:X3=0 \/ 2:X0=1 /\ 2:X3=0)
+Generated assembler
+EOF
+cat ARMARM02.t
+$QEMU ./ARMARM02.exe -q $LITMUSOPTS
+ret=$?
+if [ $ret -eq 1 ]; then
+    echo "FAILED";
+    exit 1;
+fi
+fi
+sleep $SLEEP
+
+if [ ! -f ARMARM03.no ]; then
+cat <<'EOF'
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Results for ../alex_litmus/aarch64.tests/HAND/ARMARM03.litmus %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+AArch64 ARMARM03
+"PodWWPL RfeLP DpAddrdR Fre"
+
+{y=z; z=-1; 0:X1=x; 0:X3=y; 1:X3=y; 1:X9=-1;}
+
+ P0           | P1          ;
+ MOV W0,#1    | LDR X0,[X3] ;
+ STR W0,[X1]  | LDR W9,[X0] ;
+ STLR X1,[X3] |             ;
+
+~exists (1:X0=x /\ 1:X9=0)
+Generated assembler
+EOF
+cat ARMARM03.t
+$QEMU ./ARMARM03.exe -q $LITMUSOPTS
+ret=$?
+if [ $ret -eq 1 ]; then
+    echo "FAILED";
+    exit 1;
+fi
+fi
+sleep $SLEEP
+
+if [ ! -f ARMARM04+BIS.no ]; then
+cat <<'EOF'
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Results for ../alex_litmus/aarch64.tests/HAND/ARMARM04+BIS.litmus %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+AArch64 ARMARM04+BIS
+"RfeLA PodRWAP Rfe DpAddrdRPA FreAL"
+
+{0:X1=x; 1:X1=x; 1:X3=y; 2:X1=y; 2:X4=x; 2:X3=-1;}
+
+ P0           | P1           | P2                ;
+ MOV W0,#1    | LDAR W0,[X1] | LDR W0,[X1]       ;
+ STLR W0,[X1] | CMP W0,#1    | CMP W0,#1         ;
+              | B.NE Exit1   | B.NE Exit2        ;
+              | MOV W2,#1    | EOR W2,W0,W0      ;
+              | STR W2,[X3]  | ADD X5,X4,W2,SXTW ;
+              | Exit1:       | LDR W3,[X5]       ;
+              |              | Exit2:            ;
+
+exists (1:X0=1 /\ 2:X0=1 /\ 2:X3=0)
+Generated assembler
+EOF
+cat ARMARM04+BIS.t
+$QEMU ./ARMARM04+BIS.exe -q $LITMUSOPTS
+ret=$?
+if [ $ret -eq 1 ]; then
+    echo "FAILED";
+    exit 1;
+fi
+fi
+sleep $SLEEP
+
+if [ ! -f ARMARM04.no ]; then
+cat <<'EOF'
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Results for ../alex_litmus/aarch64.tests/HAND/ARMARM04.litmus %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+AArch64 ARMARM04
+"RfeLA PodRWAP Rfe DpAddrdRPA FreAL"
+
+{0:X1=x; 1:X1=x; 1:X3=y; 2:X1=y; 2:X4=x; 2:X3=-1;}
+
+ P0           | P1           | P2                ;
+ MOV W0,#1    | LDAR W0,[X1] | LDR W0,[X1]       ;
+ STLR W0,[X1] | CMP W0,#1    | CMP W0,#1         ;
+              | B.NE Exit1   | B.NE Exit2        ;
+              | MOV W2,#1    | EOR W2,W0,W0      ;
+              | STR W2,[X3]  | ADD X5,X4,W2,SXTW ;
+              | Exit1:       | LDAR W3,[X5]      ;
+              |              | Exit2:            ;
+
+~exists (1:X0=1 /\ 2:X0=1 /\ 2:X3=0)
+Generated assembler
+EOF
+cat ARMARM04.t
+$QEMU ./ARMARM04.exe -q $LITMUSOPTS
+ret=$?
+if [ $ret -eq 1 ]; then
+    echo "FAILED";
+    exit 1;
+fi
+fi
+sleep $SLEEP
+
+if [ ! -f ARMARM04+TER.no ]; then
+cat <<'EOF'
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Results for ../alex_litmus/aarch64.tests/HAND/ARMARM04+TER.litmus %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+AArch64 ARMARM04+TER
+"RfeLA PodRWAP Rfe PodRRPA FreAL"
+
+{0:X1=x; 1:X1=x; 1:X3=y; 2:X1=y; 2:X3=x;}
+
+ P0           | P1           | P2           ;
+ MOV W0,#1    | LDAR W0,[X1] | LDR W0,[X1]  ;
+ STLR W0,[X1] | MOV W2,#1    | LDAR W2,[X3] ;
+              | STR W2,[X3]  |              ;
+
+exists (1:X0=1 /\ 2:X0=1 /\ 2:X2=0)
+Generated assembler
+EOF
+cat ARMARM04+TER.t
+$QEMU ./ARMARM04+TER.exe -q $LITMUSOPTS
+ret=$?
+if [ $ret -eq 1 ]; then
+    echo "FAILED";
+    exit 1;
+fi
+fi
+sleep $SLEEP
+
+if [ ! -f ARMARM05.no ]; then
+cat <<'EOF'
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Results for ../alex_litmus/aarch64.tests/HAND/ARMARM05.litmus %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+AArch64 ARMARM05
+"Rfe PodRWPL RfeLP DpAddrdR Fre"
+
+{0:X1=x; 1:X1=x; 1:X3=y; 2:X1=y; 2:X4=x; 2:X3=-1;}
+
+ P0          | P1           | P2                  ;
+ MOV W0,#1   | LDR W0,[X1]  | LDR W0,[X1]         ;
+ STR W0,[X1] | CMP W0,#1    | CMP W0,#1           ;
+             | B.NE Exit1   | B.NE Exit2          ;
+             | MOV W2,#1    | EOR W2,W0,W0        ;
+             | STLR W2,[X3] | LDR W3,[X4,W2,SXTW] ;
+             | Exit1:       | Exit2:              ;
+
+~exists (1:X0=1 /\ 2:X0=1 /\ 2:X3=0)
+Generated assembler
+EOF
+cat ARMARM05.t
+$QEMU ./ARMARM05.exe -q $LITMUSOPTS
+ret=$?
+if [ $ret -eq 1 ]; then
+    echo "FAILED";
+    exit 1;
+fi
+fi
+sleep $SLEEP
+
+if [ ! -f ARMARM06+AP+AA.no ]; then
+cat <<'EOF'
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Results for ../alex_litmus/aarch64.tests/HAND/ARMARM06+AP+AA.litmus %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+AArch64 ARMARM06+AP+AA
+"RfeLA PodRRAP FrePL RfeLA PodRRAA FreAL"
+
+{0:X1=x; 1:X1=x; 1:X3=y; 2:X1=y; 3:X1=y; 3:X3=x;}
+
+ P0           | P1           | P2           | P3           ;
+ MOV W0,#1    | LDAR W0,[X1] | MOV W0,#1    | LDAR W0,[X1] ;
+ STLR W0,[X1] | LDR W2,[X3]  | STLR W0,[X1] | LDAR W2,[X3] ;
+
+exists (1:X0=1 /\ 1:X2=0 /\ 3:X0=1 /\ 3:X2=0)
+Generated assembler
+EOF
+cat ARMARM06+AP+AA.t
+$QEMU ./ARMARM06+AP+AA.exe -q $LITMUSOPTS
+ret=$?
+if [ $ret -eq 1 ]; then
+    echo "FAILED";
+    exit 1;
+fi
+fi
+sleep $SLEEP
+
+if [ ! -f ARMARM06+AP+AP.no ]; then
+cat <<'EOF'
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Results for ../alex_litmus/aarch64.tests/HAND/ARMARM06+AP+AP.litmus %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+AArch64 ARMARM06+AP+AP
+"RfeLA PodRRAP FrePL RfeLA PodRRAP FrePL"
+
+{0:X1=x; 1:X1=x; 1:X3=y; 2:X1=y; 3:X1=y; 3:X3=x;}
+
+ P0           | P1           | P2           | P3           ;
+ MOV W0,#1    | LDAR W0,[X1] | MOV W0,#1    | LDAR W0,[X1] ;
+ STLR W0,[X1] | LDR W2,[X3]  | STLR W0,[X1] | LDR W2,[X3]  ;
+
+exists (1:X0=1 /\ 1:X2=0 /\ 3:X0=1 /\ 3:X2=0)
+Generated assembler
+EOF
+cat ARMARM06+AP+AP.t
+$QEMU ./ARMARM06+AP+AP.exe -q $LITMUSOPTS
+ret=$?
+if [ $ret -eq 1 ]; then
+    echo "FAILED";
+    exit 1;
+fi
+fi
+sleep $SLEEP
+
+if [ ! -f ARMARM06.no ]; then
+cat <<'EOF'
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Results for ../alex_litmus/aarch64.tests/HAND/ARMARM06.litmus %
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+AArch64 ARMARM06
+"RfeLA PodRRAA FreAL RfeLA PodRRAA FreAL"
+
+{0:X1=x; 1:X1=x; 1:X3=y; 2:X1=y; 3:X1=y; 3:X3=x;}
+
+ P0           | P1           | P2           | P3           ;
+ MOV W0,#1    | LDAR W0,[X1] | MOV W0,#1    | LDAR W0,[X1] ;
+ STLR W0,[X1] | LDAR W2,[X3] | STLR W0,[X1] | LDAR W2,[X3] ;
+
+~exists (1:X0=1 /\ 1:X2=0 /\ 3:X0=1 /\ 3:X2=0)
+Generated assembler
+EOF
+cat ARMARM06.t
+$QEMU ./ARMARM06.exe -q $LITMUSOPTS
+ret=$?
+if [ $ret -eq 1 ]; then
+    echo "FAILED";
+    exit 1;
+fi
+fi
+sleep $SLEEP
+
+cat <<'EOF'
+Revision exported, version 7.22
+Command line: ../litmus-7.22/litmus -exit true -mach ../alex_litmus/overdrive01 -o run.armarm ../alex_litmus/aarch64.tests/HAND/ARMARM00.litmus ../alex_litmus/aarch64.tests/HAND/ARMARM01.litmus ../alex_litmus/aarch64.tests/HAND/ARMARM02.litmus ../alex_litmus/aarch64.tests/HAND/ARMARM03.litmus ../alex_litmus/aarch64.tests/HAND/ARMARM04+BIS.litmus ../alex_litmus/aarch64.tests/HAND/ARMARM04.litmus ../alex_litmus/aarch64.tests/HAND/ARMARM04+TER.litmus ../alex_litmus/aarch64.tests/HAND/ARMARM05.litmus ../alex_litmus/aarch64.tests/HAND/ARMARM06+AP+AA.litmus ../alex_litmus/aarch64.tests/HAND/ARMARM06+AP+AP.litmus ../alex_litmus/aarch64.tests/HAND/ARMARM06.litmus
+Parameters
+#define SIZE_OF_TEST 100000
+#define NUMBER_OF_RUN 10
+#define AVAIL 0
+#define STRIDE 1
+#define MAX_LOOP 0
+/* gcc options: -D_GNU_SOURCE -DFORCE_AFFINITY -Wall -std=gnu99 -O2 -pthread */
+/* barrier: user */
+/* launch: changing */
+/* affinity: incr0 */
+/* alloc: dynamic */
+/* memory: direct */
+/* stride: 1 */
+/* safer: write */
+/* preload: random */
+/* speedcheck: no */
+/* proc used: 0 */
+EOF
+head -1 comp.sh
+echo "LITMUSOPTS=$LITMUSOPTS"
+date
diff --git a/tests/tcg/mttcg/aarch64/show.awk b/tests/tcg/mttcg/aarch64/show.awk
new file mode 100644
index 0000000..c8ecf20
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/show.awk
@@ -0,0 +1,2 @@ 
+/START _litmus_P/ { print $0 }
+/_litmus_P[0-9]+_[0-9]+/ { getline; print $0 ; }
diff --git a/tests/tcg/mttcg/aarch64/utils.c b/tests/tcg/mttcg/aarch64/utils.c
new file mode 100644
index 0000000..cc989b0
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/utils.c
@@ -0,0 +1,1148 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* Copyright 2015-present Institut National de Recherche en Informatique et */
+/* en Automatique and the authors. All rights reserved.                     */
+/*                                                                          */
+/* This software is governed by the CeCILL-B license under French law and   */
+/* abiding by the rules of distribution of free software. You can use,      */
+/* modify and/ or redistribute the software under the terms of the CeCILL-B */
+/* license as circulated by CEA, CNRS and INRIA at the following URL        */
+/* "http://www.cecill.info". We also give a copy in LICENSE.txt.            */
+/****************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <limits.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include "utils.h"
+
+/********/
+/* Misc */
+/********/
+
+FILE *errlog ;
+
+static void checkerrlog(void) { /* default the error channel to stderr when none has been set */
+  if (!errlog) errlog = stderr ;
+}
+
+void seterrlog(FILE *chan) { /* make chan the channel that log_error() writes to */
+  errlog = chan ;
+}
+
+int log_error(const char *fmt, ...) { /* printf-style message written to errlog and flushed at once; returns vfprintf's result */
+  int result;
+  va_list args;
+  va_start(args, fmt);
+  checkerrlog() ;
+  result = vfprintf(errlog, fmt, args);
+  fflush(errlog);
+  va_end(args);
+  return result;
+}
+
+void fatal(char *msg) { /* report "Failure: msg" on errlog and on stdout, then exit with status 1 */
+  log_error("Failure: %s\n", msg) ;
+  fclose(errlog); /* log_error() has made sure errlog is non-NULL by now */
+  fprintf(stdout,"Failure: %s\n", msg) ;
+  exit(1) ;
+}
+
+void errexit(char *msg,int err) { /* report "msg: <strerror(err)>" on errlog, then exit with status 2 */
+  log_error("%s: %s\n",msg,strerror(err)) ;
+  fclose(errlog);
+  exit(2) ;
+}
+
+void *malloc_check(size_t sz) { /* malloc wrapper: exits through errexit() on failure; a zero-size request yields NULL without calling malloc */
+  if (sz == 0) return NULL ;
+  void *p = malloc(sz) ;
+  if (!p) {
+    if (!errno) errno = ENOMEM ; /* make sure errexit() has an error code to describe */
+    errexit("malloc",errno) ;
+  }
+  return p ;
+}
+
+int max(int n, int m) { return n < m ? m : n ; } /* larger of n and m */
+
+void pp_ints(FILE *fp,int *p,int n) { /* print p[0..n-1] on fp, comma separated, without a newline; prints nothing when n <= 0 */
+  if (n > 0) {
+    fprintf(fp,"%i",p[0]) ;
+    for (int k = 1 ; k < n ; k++) {
+      fprintf(fp,",%i",p[k]) ;
+    }
+  }
+}
+
+
+void *do_align(void *p,size_t sz) { /* round address p up to the next multiple of sz (p itself when already a multiple); sz must be non-zero */
+  uintptr_t x = (uintptr_t)p ;
+  x += sz-1 ;
+  x /= sz ;
+  x *= sz ;
+  return (void *)x ;
+}
+
+void *do_noalign(void *p,size_t sz) { /* address not below p that lies sz/2 away from a multiple of sz, i.e. deliberately off the sz alignment */
+  void *q = do_align(p,sz) ;
+  void *r = q - sz/2 ; /* arithmetic on void * relies on the GNU C extension (byte units) */
+  if (r < p) r = q + sz/2 ; /* stepping back would leave the area starting at p: step forward instead */
+  return r ;
+}
+
+void cat_file(char *path, char *msg, FILE *out) { /* when path can be opened: write msg on a line of its own, then the file contents, to out; otherwise do nothing */
+  FILE *fp = fopen(path,"r") ;
+  if (fp == NULL) return ;
+  fprintf(out,"%s\n",msg) ;
+  int c,nl=1 ; /* nl: was the last character written a newline? (true for an empty file) */
+  while ((c = fgetc(fp)) != EOF) {
+    fputc(c,out) ;
+    nl = c == '\n' ;
+  }
+  fclose(fp) ;
+  if (!nl) fputc('\n',out) ; /* terminate an unfinished last line */
+}
+
+/************/
+/* CPU sets */
+/************/
+
+cpus_t *cpus_create(int sz) { /* allocate a cpus_t of size sz; the sz entries of cpu[] are left uninitialised */
+  cpus_t *r = malloc_check(sizeof(*r)) ;
+  r->sz = sz ;
+  r->cpu = malloc_check(sizeof(r->cpu[0])*sz)  ;
+  return r ;
+}
+
+cpus_t *cpus_create_init(int sz, int t[]) { /* allocate a cpus_t of size sz holding a copy of t[0..sz-1] */
+  cpus_t *r = cpus_create(sz) ;
+  for (int k = 0 ; k < sz ; k++) r->cpu[k] = t[k] ;
+  return r ;
+}
+
+void cpus_free(cpus_t *p) { /* release p and its cpu[] array; p is dereferenced, so it must not be NULL */
+  free(p->cpu) ;
+  free(p) ;
+}
+
+void cpus_dump(FILE *fp, cpus_t *p) { /* print the entries of p on fp, comma separated */
+  pp_ints(fp,p->cpu,p->sz) ;
+}
+
+void cpus_dump_test(FILE *fp, int *p, int sz, cpus_t *cm,int nprocs) { /* p[0..sz-1] is read in groups of nprocs; one line per group: "[entries] {cm->cpu[entry],...}" */
+  for (int k = 0 ; k < sz ; k += nprocs) {
+    fprintf(fp,"[") ;
+    pp_ints(fp,&p[k],nprocs) ;
+    fprintf(fp,"] {") ;
+    if (nprocs > 0) {
+      fprintf(fp,"%i",cm->cpu[p[k]]) ; /* each entry of the group is looked up in cm */
+      for (int i = 1 ; i < nprocs ; i++) {
+        fprintf(fp,",%i",cm->cpu[p[k+i]]) ;
+      }
+    }
+    fprintf(fp,"}\n") ;
+  }
+}
+
+/*************/
+/* Int array */
+/*************/
+
+
+void ints_dump(FILE *fp, ints_t *p) { /* print the array as comma separated index:value pairs, the list syntax that argints() parses */
+  if (p->sz > 0) {
+    fprintf(fp,"%i:%i",0,p->t[0]) ;
+    for (int k = 1 ; k < p->sz ; k++) {
+      fprintf(fp,",%i:%i",k,p->t[k]) ;
+    }
+  }
+}
+
+/***********************/
+/* Prefetch directives */
+/***********************/
+void prefetch_dump(FILE *fp, prfdirs_t *p) { /* print every directive other than none as proc:name=C, comma separated */
+  prfproc_t *q = p->t ;
+  int some = 0 ; /* becomes 1 once something was printed, so that later items get a leading comma */
+  for (int _p = 0 ; _p < p->nthreads ; _p++) {
+    int nvars = q[_p].nvars ;
+    prfone_t *r = q[_p].t ;
+    for (int _v = 0 ; _v < nvars ; _v++) {
+      prfdir_t dir = r[_v].dir ;
+      if (dir != none) {
+        char c = 'I' ; /* letter for any directive that is not one of the three tested below */
+        if (dir == flush) c = 'F' ;
+        else if (dir == touch) c = 'T' ;
+        else if (dir == touch_store) c = 'W' ;
+        if (some) {
+          fprintf(fp,",") ;
+        } else {
+          some = 1 ;
+        }
+        fprintf(fp,"%i:%s=%c",_p,r[_v].name,c) ;
+      }
+    }
+  }
+}
+
+static void set_prefetch(prfdirs_t *p, prfdir_t d) { /* give directive d to every variable of every thread */
+  prfproc_t *q = p->t ;
+  for (int _p = 0 ; _p < p->nthreads ; _p++) {
+    int nvars = q[_p].nvars ;
+    prfone_t *r = q[_p].t ;
+    for (int _v = 0 ; _v < nvars ; _v++) {
+      r[_v].dir = d ;
+    }
+  }
+}
+
+/* Greatest common divisor */
+
+int gcd(int a, int b) { /* greatest common divisor by repeated remainder; gcd(0,b) is b */
+  for ( ; ; ) {
+    if (a == 0) return b ;
+    int tmp = a ;
+    a = b % a ;
+    b = tmp ;
+  }
+}
+
+/* SMT description */
+
+
+cpus_t *coremap_seq(int navail, int nways) { /* core map where each run of nways consecutive entries shares a core id: 0,..,0,1,..,1,... */
+  cpus_t *r = cpus_create(navail) ;
+  int ncores = navail / nways ; /* only ncores*nways entries are written below: a remainder of navail stays uninitialised */
+  int i = 0 ;
+  for (int c = 0 ; c < ncores ; c++) {
+    for (int k = 0 ; k < nways ; k++) {
+      r->cpu[i++] = c ;
+    }
+  }
+  return r ;
+}
+
+cpus_t *coremap_end(int navail, int nways) { /* core map where the ids 0..ncores-1 are laid out in sequence, nways times over: 0,1,..,0,1,.. */
+  cpus_t *r = cpus_create(navail) ;
+  int ncores = navail / nways ; /* only ncores*nways entries are written below: a remainder of navail stays uninitialised */
+  int i = 0 ;
+  for (int k = 0 ; k < nways ; k++) {
+    for (int c = 0 ; c < ncores ; c++) {
+      r->cpu[i++] = c ;
+    }
+  }
+  return r ;
+}
+
+typedef struct {
+  int ncores ; /* number of entries in core[] */
+  cpus_t **core ; /* core[c]: the processors assigned to core c (built by inverse_procs) */
+} mapcore_t ;
+
+
+static void mapcore_free(mapcore_t *p) { /* release every per-core set, the core[] array and p itself */
+  for (int c = 0 ; c < p->ncores ; c++) cpus_free(p->core[c]) ;
+  free(p->core) ;
+  free(p) ;
+}
+
+#if 0
+static mapcore_t *inverse_coremap(cpus_t *p, int nways) {
+  mapcore_t *r = malloc_check(sizeof(*r)) ;
+  r->ncores = p->sz / nways ;
+  r->core = malloc_check(r->ncores * sizeof(r->core[0])) ;
+  for (int k = 0 ; k < r->ncores ; k++) {
+    r->core[k] = cpus_create(nways) ;
+    r->core[k]->sz = 0 ;
+  }
+  for (int k = 0 ; k < p->sz ; k++) {
+    int c = p->cpu[k] ;
+    cpus_t *q = r->core[c] ;
+    q->cpu[q->sz++] = k ;
+  }
+  return r ;
+}
+#endif
+
+static int get_ncores(cpus_t *cm) { /* largest value stored in cm plus one; 1 when cm is empty or holds no positive value */
+  int r = 0;
+  for (int k = 0 ; k < cm->sz ; k++) {
+    if (cm->cpu[k] > r) r = cm->cpu[k] ;
+  }
+  return r+1 ;
+}
+
+cpus_t *get_core_procs(cpus_t *cm, cpus_t *p,int c) { /* freshly allocated set of the members of p that cm maps to core c, in p's order */
+  int sz = 0 ;
+  cpus_t *r ;
+  for (int k = 0 ; k < p->sz ; k++) { /* first pass: count the matches to size the result */
+    if (cm->cpu[p->cpu[k]] == c) sz++ ;
+  }
+  r = cpus_create(sz) ;
+  int i = 0 ;
+  for (int k = 0 ; k < p->sz ; k++) { /* second pass: store them */
+    int proc = p->cpu[k] ;
+    if (cm->cpu[proc] == c) r->cpu[i++] = proc ;
+  }
+  return r ;
+}
+
+static  mapcore_t *inverse_procs(cpus_t *cm, cpus_t *p) { /* invert cm over the processors of p: one set of processors per core, for cores 0..get_ncores(cm)-1 */
+  int ncores = get_ncores(cm) ;
+  mapcore_t *r = malloc_check(sizeof(*r)) ;
+  r->ncores = ncores ;
+  r->core = malloc_check(sizeof(r->core[0])*ncores) ;
+  for (int c = 0 ; c < ncores ; c++) {
+    r->core[c] = get_core_procs(cm,p,c) ;
+  }
+  return r ;
+}
+
+static int get_node_sz(int *p) { /* number of leading non-negative ints in p; the list must be terminated by a negative value */
+  int r = 0 ;
+  while (*p++ >= 0) r++ ;
+  return r ;
+}
+
+static int get_n(int **p) { /* total length of all lists in the NULL-terminated array p */
+  int r = 0 ;
+  while (*p) {
+    r += get_node_sz(*p) ;
+    p++ ;
+  }
+  return r ;
+}
+
+static int ok_one_color(int *cm,int *d,int *a,int n, int p, int c) { /* 0 when some already placed thread related to p in the n*n matrix d has cm value c, 1 otherwise */
+  for (int k = 0 ; k < n ; k++) {
+    int op = a[k] ; /* processor given to thread k, negative while thread k is unplaced */
+    if (op >= 0) {
+      if (d[n*p+k]) { /* threads p and k are related in d */
+        int oc = cm[op] ;
+        if (oc == c) {
+          return 0 ;
+        }
+      }
+    }
+  }
+  return 1 ;
+}
+
+static int ok_color(int *cm,int *d,int *a,int n, int *q, int c) { /* 1 when ok_one_color() accepts c for every thread of the negative-terminated list q */
+  for ( ; *q >= 0 ; q++) {
+    if (!ok_one_color(cm,d,a,n,*q,c)) return 0 ;
+  }
+  return 1 ;
+}
+
+static int find_color_diff
+(int prev,st_t *st,int *cm,mapcore_t *mc,int *d, int *a,int n, int *q) { /* index of a core with room for all threads of q and accepted by ok_color(), or -1 when there is none */
+  int sz = get_node_sz(q) ;
+  int k0 = prev >= 0 && rand_bit(st) ? prev : rand_k(st,mc->ncores) ; /* start from the previous core (when given and a random bit says so) or from a random core */
+  int k = k0 ;
+  do { /* scan all cores cyclically from k0 */
+    cpus_t *p = mc->core[k] ;
+    if (p->sz >= sz && ok_color(cm,d,a,n,q,k)) return k ;
+    k++ ; k %= mc->ncores ;
+  } while (k != k0) ;
+  return -1 ;
+}
+
+
+static int find_one_proc
+(int prev,st_t *st,int *cm,mapcore_t *mc,int *d,int *a,int n,int p) { /* index of a non-empty core for thread p: one accepted by ok_one_color() if possible, else the first non-empty one scanned */
+  int found = -1 ; /* first non-empty core met, used as fallback */
+  int k0 = prev >= 0 && rand_bit(st) ? prev : rand_k(st,mc->ncores) ;
+  int k = k0 ;
+  do {
+    cpus_t *pk = mc->core[k] ;
+    if (pk->sz > 0) {
+      if (found < 0) found = k ;
+      if (ok_one_color(cm,d,a,n,p,k)) return k ;
+    }
+    k++ ; k %= mc->ncores ;
+  } while (k != k0) ;
+  if (found < 0) fatal("Cannot allocate threads") ; /* every core is empty */
+  return found ;
+}
+
+void custom_affinity (st_t *st,cpus_t *cm,int **color,int *diff,cpus_t *aff_cpus,int n_exe, int *r) { /* fill r with n_exe rows of n processors, one per thread: r[k*n+t] is the processor of thread t in row k */
+  mapcore_t *mc = inverse_procs(cm,aff_cpus) ; /* per-core pools of the usable processors; entries are consumed as threads are placed */
+  int n = get_n(color) ; /* total number of threads over all groups of color */
+  /* Diff relation as matrix */
+  int d[n*n] ;
+  {
+    int *q = diff ; /* diff: pairs x,y terminated by a negative value */
+    for (int k = 0 ; k < n*n ; k++) d[k] = 0 ;
+    while (*q >= 0) {
+      int x = *q++, y = *q++ ;
+      d[n*x+y] = d[n*y+x] = 1 ;
+    }
+  }
+  for (int k = 0 ; k < n_exe ; k++) {
+    int *a = &r[k*n] ;
+    int prev_core = -1 ;
+    for (int i = 0 ; i < n ; i++) a[i] = -1 ; /* -1: thread not placed yet */
+    for (int **q = color ; *q ; q++) {
+      int c = find_color_diff(prev_core,st,aff_cpus->cpu,mc,d,a,n,*q) ; /* NOTE(review): aff_cpus->cpu is passed where the helpers index a processor -> core table; cm->cpu may have been intended - confirm */
+      if (c >= 0) { /* the whole group fits on core c: take its processors from the end of that pool */
+        cpus_t *p = mc->core[c] ;
+        for (int *qq = *q ; *qq >= 0 ; qq++) {
+          p->sz-- ;
+          a[*qq] = p->cpu[p->sz] ;
+        }
+        prev_core = c ;
+      } else { /* no single core can host the group: place its threads one by one */
+        for (int *qq = *q ; *qq >= 0 ; qq++) {
+          int c = find_one_proc(prev_core,st,aff_cpus->cpu,mc,d,a,n,*qq) ;
+          cpus_t *p = mc->core[c] ;
+          p->sz-- ;
+          a[*qq] = p->cpu[p->sz] ;
+          prev_core = c ;
+        }
+      }
+    }
+  }
+  mapcore_free(mc) ;
+}
+
+/****************/
+/* Command line */
+/****************/
+
+/* usage */
+
+static void usage(char *prog, cmd_t *d) {
+  log_error("usage: %s (options)*\n",prog) ;
+  log_error("  -v      be verbose\n") ;
+  log_error("  -q      be quiet\n") ;
+  log_error("  -a <n>  run maximal number of tests for n available processors (default %i)\n",d->avail) ;
+  log_error("  -n <n>  run n tests concurrently\n") ;
+  log_error("  -r <n>  perform n runs (default %i)\n",d->max_run) ;
+  log_error("  -fr <f> multiply run number per f\n") ;
+  log_error("  -s <n>  outcomes per run (default %i)\n",d->size_of_test) ;
+  if (d->stride > 0) {
+    log_error("  -st <n> stride (default %i)\n",d->stride) ;
+  }
+  log_error("  -fs <f> multiply outcomes per f\n") ;
+  log_error("  -f <f>  multiply outcomes per f, divide run number by f\n") ;
+  if (d->aff_mode != aff_none) {
+    log_error("  -i <n>  increment for allocating logical processors, -i 0 disables affinity mode") ;
+    if (d->aff_mode == aff_incr) {
+      log_error(" (default %i)\n",d->aff_incr) ;
+    } else {
+      log_error("\n") ;
+    }
+    log_error("  -p <ns> specify logical processors (default '") ;
+    cpus_dump(errlog,d->aff_cpus) ;
+    log_error("')\n") ;
+    log_error("  +ra     randomise affinity%s\n",d->aff_mode == aff_random ? " (default)" : "") ;
+    if (d->aff_custom_enabled) {
+      log_error("  +ca     enable custom affinity%s\n",d->aff_mode == aff_custom ? " (default)" : "") ;
+    } else {
+      log_error("  +ca     alias for +ra\n") ;
+    }
+    if (d->aff_scan_enabled) {
+      log_error("  +sa     enable scanning affinity%s\n",d->aff_mode == aff_scan ? " (default)" : "") ;
+      log_error("  +ta <topo> set topology affinity\n") ;
+    } else {
+      log_error("  +sa     alias for +ra\n") ;
+    }
+  }
+  if (d->shuffle >= 0) {
+    log_error("  +rm     randomise memory accesses%s\n",d->shuffle ? " (default)" : "") ;
+    log_error("  -rm     do not randomise memory accesses%s\n",!d->shuffle ? " (default)" : "") ;
+  }
+  if (d->speedcheck >= 0) {
+    log_error("  +sc     stop as soon as possible%s\n",d->speedcheck ? " (default)" : "") ;
+    log_error("  -sc     run test completly%s\n",!d->speedcheck ? " (default)" : "") ;
+  }
+  if (!d->fix) {
+    log_error("  +fix    fix thread launch order\n") ;
+  }
+  if (d->delta_tb) {
+    log_error("  -tb <list> set timebase delays, default '") ;
+    ints_dump(errlog,d->delta_tb) ;
+    log_error("'\n") ;
+    log_error("    List syntax is comma separated proc:delay\n") ;
+    log_error("  -ta <n>    set all timebase delays\n") ;
+  }
+  if (d->verbose_barrier >= 0) {
+    log_error("  +vb     show iteration timings%s\n",d->verbose_barrier ? " (default)" : "") ;
+    log_error("  -vb     do not show iteration timings%s\n",!d->verbose_barrier ? " (default)" : "") ;
+  }
+  if (d->prefetch) {
+    log_error("  -pra (I|F|T|W) set all prefetch\n") ;
+    log_error("  -prf <list> set prefetch, default '") ;
+    prefetch_dump(errlog,d->prefetch) ;
+    log_error("'\n") ;
+    log_error("    List syntax is comma separated proc:name=(I|F|T|W)\n") ;
+  }
+  if (d->static_prefetch >= 0) {
+    log_error("  -prs <n> prefetch probability is 1/n, -prs 0 disables feature, default %i\n",d->static_prefetch) ;
+  }
+  if (d->max_loop > 0) {
+    log_error("  -l <n>  measure time by running assembly in a loop of size <n> (default %i)\n",d->max_loop) ;
+  }
+  if (d->prelude > 0) {
+    log_error("  -vp     no verbose prelude\n") ;
+  }
+  if (d->sync_n > 0) {
+    log_error("  -k <n>  undocumented (default %i)\n",d->sync_n) ;
+  }
+  exit(2) ;
+}
+
+static long my_add (long x, long y) { /* x+y for non-negative operands; a negative operand or a sum that would overflow ends in fatal("overflow") */
+  /* Decide before adding: signed overflow is undefined behaviour, so inspecting the sum afterwards is unreliable */
+  if (x < 0 || y < 0 || x > LONG_MAX - y) { errno = ERANGE ; fatal("overflow") ; }
+  return x+y ;
+}
+
+static long my_pow10(int p,long x) { /* x times 10^p, computed with the checked additions of my_add(); fatal unless the result r satisfies 0 < r < INT_MAX */
+  long r = x ;
+  for ( ; p > 0 ; p--) { /* one factor of ten per round: 10*r = 8*r + 2*r */
+    long y2 = my_add(r,r) ;
+    long y4 = my_add(y2,y2) ;
+    long y8 = my_add(y4,y4) ;
+    r = my_add(y8,y2) ;
+  }
+  if (r >= INT_MAX || r <= 0) {  errno = ERANGE ; fatal("overflow") ; }
+  return r ;
+}
+
+static int do_argint(char *p, char **q) { /* parse a decimal int at p with an optional k/K (x1000) or m/M (x1000000) suffix; *q is left just past the consumed text; out-of-range values end in fatal("overflow") */
+  errno = 0 ; long r = strtol(p,q,10) ; /* strtol only ever sets errno: clear it first, or a stale ERANGE would be reported as an overflow */
+  if (errno == ERANGE || r > INT_MAX || r < INT_MIN) { errno = ERANGE ; fatal("overflow") ; } /* also reject what would not survive the narrowing to int below */
+  if (**q == 'k' || **q == 'K') { r = my_pow10(3,r) ; *q += 1; }
+  else if (**q == 'm' || **q == 'M') { r = my_pow10(6,r) ; *q +=1 ; }
+  return (int)r ;
+}
+
+static int argint(char *prog,char *p,cmd_t *d) { /* parse the whole of p as an integer (see do_argint); an empty argument or trailing text goes to usage(), which exits */
+  char *q ;
+  long r = do_argint(p,&q) ;
+  if (*p == '\0' || *q != '\0') {
+    usage(prog,d) ;
+  }
+  return (int)r ;
+}
+
+/*
+ * Parse a comma separated list of non-negative CPU numbers, e.g. "0,2,4".
+ *
+ * prog, d: forwarded to usage() when the list is malformed.
+ * p0:      the list text.
+ *
+ * Returns a cpus_t obtained from cpus_create() whose cpu[] entries hold the
+ * numbers in the order given. A negative number, an empty field ("1,,2",
+ * ",3", "1,") or any unexpected character ends in usage().
+ */
+static cpus_t *argcpus(char *prog,char *p0,cmd_t *d) {
+  int sz = 0 ;
+  char *p ;
+
+  /* Pass 1: validate every field and count them. */
+  p = p0 ;
+  for ( ; ; ) {
+    char *q ;
+    int x = (int)strtol(p,&q,10) ;
+    /* q == p means strtol consumed no digits: without this test an empty
+       field would be accepted and read as CPU 0. It also covers the
+       empty-string case. */
+    if (x < 0 || q == p || (*q != '\0' && *q != ','))  usage(prog,d) ;
+    sz++ ;
+    if (*q == '\0') break ;
+    p = q+1 ;
+  }
+  /* Pass 2: the text is known to be well formed, store the numbers. */
+  cpus_t *r = cpus_create(sz) ;
+  p = p0 ;
+  for (int k = 0 ; k < sz ; k++) {
+    char *q ;
+    r->cpu[k] = (int)strtol(p,&q,10) ;
+    p = q+1 ;
+  }
+  return r ;
+}
+
+/*
+ * Parse a comma separated list of index:value pairs and store each value
+ * into r->t[index].
+ *
+ * prog, d: forwarded to usage() on malformed input.
+ * p:       the list text; values follow the do_argint syntax.
+ * r:       destination array; every index must satisfy 0 <= index < r->sz.
+ */
+static void argints(char *prog,cmd_t *d, char *p,ints_t *r) {
+  char *cur = p ;
+  while (*cur != '\0') {
+    char *end ;
+    /* "index:" part */
+    const int idx = (int)strtol(cur,&end,10) ;
+    const int idx_ok =
+      idx >= 0 && idx < r->sz && *cur != '\0' && *end == ':' ;
+    if (!idx_ok)  usage(prog,d) ;
+    /* "value" part, terminated by ',' or by the end of the text */
+    char *val = end+1 ;
+    const int v = do_argint(val,&end) ;
+    const int val_ok = *val != '\0' && (*end == '\0' || *end == ',') ;
+    if (!val_ok)  usage(prog,d) ;
+    r->t[idx] = v ;
+    /* Step over the separator, if there is one */
+    cur = (*end == '\0') ? end : end+1 ;
+  }
+}
+
+/*
+ * Look up the prefetch slot of variable 'name' among the p->nvars entries
+ * of p->t. Returns the first entry whose name compares equal (strcmp),
+ * or NULL when there is none.
+ */
+static prfone_t *get_name_slot(prfproc_t *p,char *name) {
+  const int count = p->nvars ;
+  prfone_t *slots = p->t ;
+  int i = 0 ;
+  while (i < count) {
+    prfone_t *candidate = &slots[i] ;
+    if (!strcmp(name,candidate->name)) return candidate ;
+    i++ ;
+  }
+  return NULL ; /* Name not found */
+}
+
+
+/*
+ * Handle "-pra <X>": hand one prefetch directive to set_prefetch(r,...).
+ *
+ * Only the first character of p is examined: 'F' selects flush, 'T' touch
+ * and 'W' touch_store; any other character (including 'I') selects none.
+ * prog and d are not used.
+ */
+static void argoneprefetch(char *prog,cmd_t *d, char *p, prfdirs_t *r) {
+  const char letter = *p ;
+  prfdir_t dir ;
+  if (letter == 'F') {
+    dir = flush ;
+  } else if (letter == 'T') {
+    dir = touch ;
+  } else if (letter == 'W') {
+    dir = touch_store ;
+  } else {
+    dir = none ;
+  }
+  set_prefetch(r,dir) ;
+}
+
+/*
+ * Parse a prefetch specification: a comma separated list of
+ * proc:name=X items, and record each directive in r.
+ *
+ * p: the specification. It is modified temporarily while a name is looked
+ *    up (the '=' is overwritten by NUL, then restored), so it must be
+ *    writable.
+ * r: prefetch table; proc must satisfy 0 <= proc < r->nthreads and name
+ *    must be known to get_name_slot() for that proc.
+ *
+ * X selects the directive: 'F' flush, 'T' touch, 'W' touch_store, any other
+ * character none.
+ *
+ * Returns 1 on success (an empty specification is accepted) and 0 on a
+ * syntax error or unknown variable; items parsed before the error have
+ * already been stored.
+ */
+int parse_prefetch(char *p, prfdirs_t *r) {
+  if (!*p) return 1 ;
+  for ( ;; ) {
+    char *q ;
+    int proc = (int)strtol(p,&q,10) ;
+    if (proc < 0 || proc >= r->nthreads || *p == '\0' || *q != ':')
+      return 0 ;
+    p = q+1 ;
+    /* Isolate the variable name, which runs up to the '=' sign */
+    char *p0 = p ;
+    while (*p != '=') {
+      if (*p == '\0') return 0 ;
+      p++ ;
+    }
+    *p = '\0' ;
+    prfone_t *loc_slot = get_name_slot(&r->t[proc],p0) ;
+    if (loc_slot == NULL) {
+      /* Report while the name is still NUL terminated, then restore */
+      log_error("Proc %i does not access variable %s\n",proc,p0) ;
+      *p = '=' ;
+      return 0 ;
+    }
+    *p = '=' ;
+    char c = *++p;
+    /* Nothing after '=': reject here. Going on would step past the
+       terminating NUL when the separator is read below. */
+    if (c == '\0') return 0 ;
+    prfdir_t dir = none ;
+    switch (c) {
+    case 'F':
+      dir = flush ;
+      break ;
+    case 'T':
+      dir = touch ;
+      break ;
+    case 'W':
+      dir = touch_store ;
+      break ;
+    default:
+      break ;
+    }
+    loc_slot->dir = dir ;
+    /* Separator: end of text, or ',' introducing the next item */
+    c = *++p ;
+    if (c == '\0') return 1 ;
+    else if (c == ',') p++ ;
+    else return 0 ;
+  }
+}
+
+/* Handle "-prf <list>": apply parse_prefetch, calling usage() when it rejects the list. */
+static void argprefetch(char *prog,cmd_t *d, char *p, prfdirs_t *r) {
+  const int accepted = parse_prefetch(p,r) ;
+  if (accepted) return ;
+  usage(prog,d) ;
+}
+
+/*
+ * Parse a whole command line word as a floating point number (strtod).
+ * An empty word, or trailing text after the number, is reported through
+ * usage(prog,d).
+ */
+static double argdouble(char *prog,char *p,cmd_t *d) {
+  char *end ;
+  const double value = strtod(p,&end) ;
+  const int well_formed = *p != '\0' && *end == '\0' ;
+  if (!well_formed) usage(prog,d) ;
+  return value ;
+}
+
+/*
+ * Step to the word that follows the current option and return it.
+ * argc and argv are the caller's cursor and are updated in place. A missing
+ * value (NULL slot) is reported through usage(), which is expected not to
+ * return.
+ */
+static char *take_optarg(int *argc, char ***argv, char *prog, cmd_t *d) {
+  --*argc ; ++*argv ;
+  if (!**argv) usage(prog,d) ;
+  return **argv ;
+}
+
+/*
+ * Parse the command line into *p.
+ *
+ * argc, argv: as received by main(); argv[0] is kept as the program name
+ *             for usage() and argv is expected to be NULL terminated.
+ * d:          default settings. They decide which optional switches are
+ *             recognised (for instance -st only when d->stride > 0) and
+ *             are handed to usage() on any error.
+ * p:          settings, updated in place by the recognised options.
+ *
+ * Parsing stops at the first word that does not start with '-' or '+'.
+ * Such a trailing word, an unknown or disabled option, and a missing or
+ * malformed option value all end in usage().
+ */
+void parse_cmd(int argc, char **argv, cmd_t *d, cmd_t *p) {
+  char *prog = argv[0] ;
+
+  /* Options */
+  for ( ; ; ) {
+    --argc ; ++argv ;
+    if (!*argv) break ;
+    char fst = **argv ;
+    if (fst != '-' && fst != '+') break ;
+    char *opt = *argv ; /* the option word; argv may advance to its value */
+    if (strcmp(opt,"-q") == 0) p->verbose=0 ;
+    else if (strcmp(opt,"-v") == 0) p->verbose++ ;
+    else if (strcmp(opt,"-r") == 0) {
+      p->max_run = argint(prog,take_optarg(&argc,&argv,prog,d),d) ;
+    } else if (strcmp(opt,"-fr") == 0) {
+      p->max_run *= argdouble(prog,take_optarg(&argc,&argv,prog,d),d) ;
+    } else if (strcmp(opt,"-s") == 0) {
+      p->size_of_test = argint(prog,take_optarg(&argc,&argv,prog,d),d) ;
+    } else if (d->stride > 0 && strcmp(opt,"-st") == 0) {
+      p->stride = argint(prog,take_optarg(&argc,&argv,prog,d),d) ;
+      if (p->stride <= 0) p->stride = 1 ;
+    } else if (strcmp(opt,"-fs") == 0) {
+      p->size_of_test *= argdouble(prog,take_optarg(&argc,&argv,prog,d),d) ;
+    } else if (strcmp(opt,"-f") == 0) {
+      /* Scale the test size up and the number of runs down by f */
+      double f = argdouble(prog,take_optarg(&argc,&argv,prog,d),d) ;
+      p->size_of_test *= f ;
+      p->max_run /= f ;
+    } else if (strcmp(opt,"-n") == 0) {
+      p->n_exe = argint(prog,take_optarg(&argc,&argv,prog,d),d) ;
+      if (p->n_exe < 1) p->n_exe = 1 ;
+    } else if (strcmp(opt,"-a") == 0) {
+      p->avail = argint(prog,take_optarg(&argc,&argv,prog,d),d) ;
+    } else if (d->sync_n > 0 && strcmp(opt,"-k") == 0) {
+      int a = argint(prog,take_optarg(&argc,&argv,prog,d),d) ;
+      p->sync_n = a < 0 ? 0 : a ;
+    } else if (d->aff_mode != aff_none && strcmp(opt,"-i") == 0) {
+      int i = argint(prog,take_optarg(&argc,&argv,prog,d),d) ;
+      p->aff_mode = aff_incr ;
+      p->aff_incr = i < 0 ? 0 : i ;
+    } else if (d->aff_mode != aff_none && strcmp(opt,"-p") == 0) {
+      p->aff_cpus = argcpus(prog,take_optarg(&argc,&argv,prog,d),d) ;
+    } else if (d->aff_mode != aff_none && strcmp(opt,"+ra") == 0) {
+      p->aff_mode = aff_random ;
+    } else if (d->aff_custom_enabled && strcmp(opt,"+ca") == 0) {
+      p->aff_mode = aff_custom ;
+    } else if (d->aff_mode != aff_none && strcmp(opt,"+ca") == 0) {
+      /* Custom affinity not available: fall back to random */
+      p->aff_mode = aff_random ;
+    } else if (d->aff_scan_enabled && strcmp(opt,"+sa") == 0) {
+      p->aff_mode = aff_scan ;
+    } else if (d->aff_mode != aff_none && strcmp(opt,"+sa") == 0) {
+      /* Scan affinity not available: fall back to random */
+      p->aff_mode = aff_random ;
+    } else if (d->aff_scan_enabled && strcmp(opt,"+ta") == 0) {
+      p->aff_mode = aff_topo ;
+      p->aff_topo = take_optarg(&argc,&argv,prog,d) ;
+      /* A second, identical "+sa" fallback used to follow this branch; it
+         could never be reached and has been dropped. */
+    } else if (d->shuffle >= 0 && strcmp(opt,"+rm") == 0) {
+      p->shuffle = 1 ;
+    } else if (d->shuffle >= 0 && strcmp(opt,"-rm") == 0) {
+      p->shuffle = 0 ;
+    } else if (d->speedcheck >= 0 && strcmp(opt,"+sc") == 0) {
+      p->speedcheck = 1 ;
+    } else if (d->speedcheck >= 0 && strcmp(opt,"-sc") == 0) {
+      p->speedcheck = 0 ;
+    } else if (!d->fix &&  strcmp(opt,"+fix") == 0) {
+      p->fix = 1 ;
+    } else if (d->verbose_barrier >= 0 && strcmp(opt,"+vb") == 0) {
+      p->verbose_barrier++ ;
+    } else if (d->verbose_barrier >= 0 && strcmp(opt,"-vb") == 0) {
+      p->verbose_barrier = 0 ;
+    } else if (d->prelude > 0 && strcmp(opt,"-vp") == 0) {
+      p->prelude = 0 ;
+    } else if (d->delta_tb &&  strcmp(opt,"-tb") == 0) {
+      argints(prog,d,take_optarg(&argc,&argv,prog,d),p->delta_tb) ;
+    } else if (d->delta_tb &&  strcmp(opt,"-ta") == 0) {
+      int da = argint(prog,take_optarg(&argc,&argv,prog,d),d) ;
+      for (int k = 0 ; k < p->delta_tb->sz ; k++) p->delta_tb->t[k] = da ;
+    } else if (d->prefetch && strcmp(opt,"-prf") == 0) {
+      argprefetch(prog,d,take_optarg(&argc,&argv,prog,d),p->prefetch) ;
+    } else if (d->prefetch && strcmp(opt,"-pra") == 0) {
+      argoneprefetch(prog,d,take_optarg(&argc,&argv,prog,d),p->prefetch) ;
+    } else  if (d->static_prefetch >= 0 &&  strcmp(opt,"-prs") == 0) {
+      int prs = argint(prog,take_optarg(&argc,&argv,prog,d),d) ;
+      p->static_prefetch = prs >= 0 ? prs : 0 ;
+    } else if (d->max_loop > 0 && strcmp(opt,"-l") == 0) {
+      int i = argint(prog,take_optarg(&argc,&argv,prog,d),d) ;
+      p->max_loop = i < 1 ? 1 : i ;
+    } else usage(prog,d) ;
+  }
+
+  /* Argument: none is accepted, any word left over is an error */
+  if (argc == 0) return ;
+  usage(prog,d) ;
+}
+
+/*************************/
+/* Concurrency utilities */
+/*************************/
+
+/* pthread based mutex */
+
+/*
+ * Allocate (malloc_check) and initialise a mutex with default attributes.
+ * A pthread_mutex_init failure is reported through errexit("mutex_init").
+ * Release the result with pm_free().
+ */
+pm_t *pm_create(void) {
+  pm_t *mutex = malloc_check(sizeof(*mutex)) ;
+  const int err = pthread_mutex_init(mutex,NULL) ;
+  if (err != 0) errexit("mutex_init",err) ;
+  return mutex ;
+}
+
+/*
+ * Destroy and release a mutex obtained from pm_create().
+ *
+ * The mutex must be unlocked. It is destroyed with pthread_mutex_destroy
+ * before its storage is freed, so that an implementation that attaches
+ * resources to an initialised mutex can reclaim them; a failure is reported
+ * through errexit("mutex_destroy"). A NULL pointer is ignored, as free()
+ * would.
+ */
+void pm_free(pm_t *p) {
+  if (p == NULL) return ;
+  int ret = pthread_mutex_destroy(p) ;
+  if (ret) { errexit("mutex_destroy",ret) ; }
+  free(p) ;
+}
+
+/* Lock m; a pthread_mutex_lock failure is reported through errexit("mutex_lock"). */
+void pm_lock(pm_t *m) {
+  const int err = pthread_mutex_lock(m) ;
+  if (err != 0) errexit("mutex_lock",err) ;
+}
+
+/* Unlock m; a pthread_mutex_unlock failure is reported through errexit("mutex_unlock"). */
+void pm_unlock(pm_t *m) {
+  const int err = pthread_mutex_unlock(m) ;
+  if (err != 0) errexit("mutex_unlock",err) ;
+}
+
+/* pthread condition */
+
+/*
+ * Allocate a condition variable bundled with its own mutex.
+ * The mutex comes from pm_create(); the pthread_cond_t is heap allocated
+ * and initialised with default attributes. A pthread_cond_init failure is
+ * reported through errexit("cond_init"). Release with pc_free().
+ */
+pc_t *pc_create(void) {
+  pc_t *pc = malloc_check(sizeof(*pc)) ;
+  pc->c_mutex = pm_create() ;
+  pthread_cond_t *cond = malloc_check(sizeof(*cond)) ;
+  pc->c_cond = cond ;
+  const int err = pthread_cond_init(cond,NULL) ;
+  if (err != 0) errexit("cond_init",err) ;
+  return pc ;
+}
+
+/*
+ * Destroy and release a condition obtained from pc_create().
+ *
+ * No thread may still be waiting on it. The pthread_cond_t is destroyed
+ * with pthread_cond_destroy before its storage is freed (failure is
+ * reported through errexit("cond_destroy")); the mutex is released through
+ * pm_free().
+ */
+void pc_free(pc_t *p) {
+  int e = pthread_cond_destroy(p->c_cond) ;
+  if (e) { errexit("cond_destroy",e) ; }
+  pm_free(p->c_mutex) ;
+  free(p->c_cond) ;
+  free(p) ;
+}
+
+/* Take the mutex that protects condition p. */
+static void pc_lock(pc_t *p) {
+  pm_t *guard = p->c_mutex ;
+  pm_lock(guard) ;
+}
+
+/* Release the mutex that protects condition p. */
+static void pc_unlock(pc_t *p) {
+  pm_t *guard = p->c_mutex ;
+  pm_unlock(guard) ;
+}
+
+/*
+ * Wait on condition p using p's own mutex (pthread_cond_wait, which expects
+ * that mutex to be held by the caller). A failure is reported through
+ * errexit("cond_wait").
+ */
+void pc_wait(pc_t *p) {
+  const int err = pthread_cond_wait(p->c_cond, p->c_mutex) ;
+  if (err != 0) errexit("cond_wait",err) ;
+}
+
+/* Wake every thread waiting on p; failure is reported through errexit("cond_broadcast"). */
+void pc_broadcast (pc_t *p) {
+  const int err = pthread_cond_broadcast(p->c_cond) ;
+  if (err != 0) errexit("cond_broadcast",err) ;
+}
+
+/* Signal condition p (pthread_cond_signal); failure is reported through errexit("cond_signal"). */
+static void pc_signal(pc_t *p) {
+  const int err = pthread_cond_signal(p->c_cond) ;
+  if (err != 0) { errexit("cond_signal",err) ; }
+}
+
+
+/* pthread based barrier, usable for nproc threads */
+
+
+/*
+ * Create a barrier for nprocs participants: the arrival counter starts at
+ * nprocs and the turn flag at 0. Release with pb_free().
+ */
+pb_t *pb_create(int nprocs) {
+  pb_t *barrier = malloc_check(sizeof(*barrier)) ;
+  barrier->cond = pc_create() ;
+  barrier->nprocs = nprocs ;
+  barrier->count = barrier->nprocs ;
+  barrier->turn = 0 ;
+  return barrier ;
+}
+
+/* Release a barrier from pb_create(): first its condition, then the barrier itself. */
+void pb_free(pb_t *p) {
+  pc_t *cond = p->cond ;
+  pc_free(cond) ;
+  free(p) ;
+}
+
+/*
+ * Barrier rendezvous: returns once all p->nprocs participants have called.
+ *
+ * Every caller decrements the arrival counter under the barrier's lock.
+ * The last one to arrive re-arms the counter, flips 'turn' and wakes
+ * everybody; the others sleep until 'turn' differs from the value they saw
+ * on entry. Testing 'turn' in a loop is what protects against spurious
+ * wake ups.
+ */
+void pb_wait(pb_t *p) {
+  pc_lock(p->cond) ;
+  const int entry_turn = p->turn ;
+  --p->count ;
+  if (p->count != 0) {
+    /* Not the last one: wait for the turn to change */
+    do {
+      pc_wait(p->cond) ;
+    } while (p->turn == entry_turn) ;
+  } else {
+    /* Last arrival: reset for the next round and release the others */
+    p->count = p->nprocs ;
+    p->turn = !entry_turn ;
+    pc_broadcast(p->cond) ;
+  }
+  pc_unlock(p->cond) ;
+}
+
+
+/* pthread based or flag */
+
+/*
+ * Create an 'or' flag shared by nprocs participants: the arrival counter
+ * starts at nprocs, the flag value and the turn at 0. Release with po_free().
+ */
+po_t *po_create(int nprocs) {
+  po_t *flag = malloc_check(sizeof(*flag)) ;
+  flag->cond = pc_create() ;
+  flag->count = nprocs ;
+  flag->nprocs = nprocs ;
+  flag->val = 0 ;
+  flag->turn = 0 ;
+  return flag ;
+}
+
+/* Release an 'or' flag from po_create(): first its condition, then the flag itself. */
+void po_free(po_t *p) {
+  pc_t *cond = p->cond ;
+  pc_free(cond) ;
+  free(p) ;
+}
+
+/*
+ * Reset the flag value to 0. This is a rendezvous: it returns once all
+ * p->nprocs participants have called it. The last arrival clears the value,
+ * re-arms the counter, flips 'turn' and wakes the others, which wait in a
+ * loop until 'turn' has changed.
+ */
+void po_reinit(po_t *p) {
+  pc_lock(p->cond) ;
+  const int entry_turn = p->turn ;
+  --p->count ;
+  if (p->count != 0) {
+    /* Not the last one: wait for the turn to change */
+    do {
+      pc_wait(p->cond) ;
+    } while (p->turn == entry_turn) ;
+  } else {
+    /* Last arrival: reset everything and release the others */
+    p->count = p->nprocs ;
+    p->val = 0 ;
+    p->turn = !entry_turn ;
+    pc_broadcast(p->cond) ;
+  }
+  pc_unlock(p->cond) ;
+}
+
+/*
+ * Contribute v to the shared flag and wait for the other participants.
+ *
+ * Each caller or-s its v into p->val under the lock, then the call behaves
+ * as a barrier over p->nprocs participants (same turn-flipping scheme as
+ * po_reinit). Returns p->val as read, still under the lock, after the
+ * rendezvous.
+ */
+int po_wait(po_t *p, int v) {
+  pc_lock(p->cond) ;
+  const int entry_turn = p->turn ;
+  --p->count ;
+  p->val = (p->val || v) ;
+  if (p->count != 0) {
+    /* Not the last one: wait for the turn to change */
+    do {
+      pc_wait(p->cond) ;
+    } while (p->turn == entry_turn) ;
+  } else {
+    /* Last arrival: re-arm and release the others */
+    p->count = p->nprocs ;
+    p->turn = !entry_turn ;
+    pc_broadcast(p->cond) ;
+  }
+  const int result = p->val ;
+  pc_unlock(p->cond) ;
+  return result ;
+}
+
+
+/* One place buffer */
+
+/* Create an empty one place buffer (no value stored). Release with op_free(). */
+op_t *op_create(void) {
+  op_t *buf = malloc_check(sizeof(*buf)) ;
+  buf->cond = pc_create() ;
+  buf->some = 0 ;
+  buf->val = NULL ;
+  return buf ;
+}
+
+/* Release a one place buffer: first its condition, then the buffer. The stored value is not freed. */
+void op_free(op_t *p) {
+  pc_t *cond = p->cond ;
+  pc_free(cond) ;
+  free(p) ;
+}
+
+/*
+ * Store v into the buffer and signal a waiting reader.
+ * The buffer must be empty: storing into a full buffer is reported through
+ * fatal("op_set").
+ */
+void op_set(op_t *p, void *v) {
+  pc_t *cond = p->cond ;
+  pc_lock(cond) ;
+  if (p->some != 0) fatal("op_set") ;
+  p->val = v ;
+  p->some = 1 ;
+  pc_signal(cond) ;
+  pc_unlock(cond) ;
+}
+
+/*
+ * Take the value out of the buffer, blocking until one has been stored by
+ * op_set(). The buffer is left empty. Waiting in a loop on 'some' guards
+ * against wake ups that happen while the buffer is still empty.
+ */
+void *op_get(op_t *p) {
+  pc_t *cond = p->cond ;
+  pc_lock(cond) ;
+  for ( ; !p->some ; ) {
+    pc_wait(cond) ;
+  }
+  void *taken = (void *) p->val ;
+  p->val = NULL ;
+  p->some = 0 ;
+  pc_unlock(cond) ;
+  return taken ;
+}
+
+/* Thread launch and join */
+
+/*
+ * Start a thread running f(a) with default attributes.
+ * th receives the thread identifier. A pthread_create failure is reported
+ * through errexit("pthread_create") (the label used to be misspelled).
+ */
+void launch(pthread_t *th, f_t *f, void *a) {
+  int e = pthread_create(th,NULL,f,a);
+  if (e) errexit("pthread_create",e);
+}
+
+/*
+ * Wait for thread *th to finish and return the value it returned.
+ * A pthread_join failure is reported through errexit("pthread_join").
+ */
+void *join(pthread_t *th) {
+  void *result ;
+  const int err = pthread_join(*th,&result) ;
+  if (err != 0) { errexit("pthread_join",err) ; }
+  return result ;
+}
+
+/* Detached */
+
+/* Work description handed to a detached thread: launch_detached() fills it,
+   zyva_det() reads it and frees it. */
+typedef struct {
+  f_t *f;      /* function the thread runs */
+  void *a ;    /* argument passed to f */
+  op_t *op;    /* one place buffer that receives the result of f(a) */
+} detarg_t ;
+
+/*
+ * Entry point of a detached thread.
+ * _b is a heap allocated detarg_t, which this function takes over and
+ * frees. The thread detaches itself, runs f(a) and posts the result into
+ * the one place buffer. Always returns NULL.
+ */
+static void *zyva_det(void *_b) {
+  detarg_t *box = (detarg_t *)_b ;
+  const detarg_t job = *box ; /* copy out, the box is freed right away */
+  free(box) ;
+  const int err = pthread_detach(pthread_self()) ;
+  if (err != 0) { errexit("pthread_detach",err) ; }
+  void *result = job.f(job.a) ;
+  op_set(job.op,result) ;
+  return NULL ;
+}
+
+/*
+ * Run f(a) in a new thread that detaches itself (see zyva_det).
+ * Returns the one place buffer that will receive the result; hand it to
+ * join_detached() to collect the result. The thread identifier is not kept.
+ */
+op_t *launch_detached(f_t *f,void *a) {
+  op_t *result_box = op_create() ;
+  detarg_t *job = malloc_check(sizeof(*job)) ;
+  job->f = f ;
+  job->a = a ;
+  job->op = result_box ;
+  pthread_t tid ;
+  launch(&tid,zyva_det,job) ;
+  return result_box ;
+}
+
+/*
+ * Collect the result of a thread started by launch_detached(): blocks in
+ * op_get() until the result is posted, then frees the buffer.
+ */
+void *join_detached(op_t *op) {
+  void *result = op_get(op) ;
+  op_free(op) ;
+  return result ;
+}
+
+/* Thread cache */
+
+/*
+ * Body of a cached worker thread; _a points to a sarg_t.
+ *
+ * For at most max_run rounds: take a function pointer from op_arg, call it
+ * on the fixed argument 'arg' and post its result into op_ret. Receiving a
+ * NULL function ends the loop early. Always returns NULL.
+ */
+void *start_thread(void *_a) {
+  sarg_t *ctx = (sarg_t *)_a ;
+  int remaining = ctx->max_run ;
+  while (remaining > 0) {
+    f_t *job = (f_t *)op_get(ctx->op_arg) ;
+    if (job == NULL) break ;
+    void *result = job(ctx->arg) ;
+    op_set(ctx->op_ret,result) ;
+    remaining-- ;
+  }
+  return NULL ;
+}
+
+/*****************/
+/* Random things */
+/*****************/
+
+/*
+ * Randomly choose the first m entries of the n-element array _t.
+ * For each position k < m, entry k is swapped with entry k+rand_k(st,n-k).
+ * NOTE(review): rand_k(st,x) is presumably uniform in [0,x) -- confirm in
+ * litmus_rand.c.
+ */
+void perm_prefix_ints(unsigned *st,int *_t, int m, int n) {
+  for (int pos = 0 ; pos < m ; pos++) {
+    const int pick = pos + rand_k(st,n-pos) ;
+    const int held = _t[pick] ;
+    _t[pick] = _t[pos] ;
+    _t[pos] = held ;
+  }
+}
+
+/* Shuffle the whole n-element array _t: perm_prefix_ints over its first n-1 positions. */
+void perm_ints(unsigned *st,int *_t, int n) {
+  const int prefix = n-1 ;
+  perm_prefix_ints(st,_t,prefix,n) ;
+}
+
+/*
+ * Shuffle an array of n function pointers in place: each position k < n-1
+ * is swapped with position k+rand_k(st,n-k).
+ */
+void perm_funs(unsigned *st,f_t *fun[], int n) {
+  for (int pos = 0 ; pos+1 < n ; pos++) {
+    const int pick = pos + rand_k(st,n-pos) ;
+    f_t *held = fun[pos] ;
+    fun[pos] = fun[pick] ;
+    fun[pick] = held ;
+  }
+}
+
+/*
+ * Shuffle an array of n one place buffer pointers in place: each position
+ * k < n-1 is swapped with position k+rand_k(st,n-k).
+ */
+void perm_ops(unsigned *st,op_t *op[], int n) {
+  for (int pos = 0 ; pos+1 < n ; pos++) {
+    const int pick = pos + rand_k(st,n-pos) ;
+    op_t *held = op[pos] ;
+    op[pos] = op[pick] ;
+    op[pick] = held ;
+  }
+}
+
+/*
+ * Shuffle an array of n thread identifiers in place: each position k < n-1
+ * is swapped with position k+rand_k(st,n-k).
+ */
+void perm_threads(unsigned *st,pthread_t thread[], int n) {
+  for (int pos = 0 ; pos+1 < n ; pos++) {
+    const int pick = pos + rand_k(st,n-pos) ;
+    pthread_t held = thread[pos] ;
+    thread[pos] = thread[pick] ;
+    thread[pick] = held ;
+  }
+}
+
+/* qsort comparison function for ints: -1, 0 or 1 as *_p is less than, equal to or greater than *_q. */
+static int int_cmp(const void *_p, const void *_q) {
+  const int lhs = *(const int *)_p ;
+  const int rhs = *(const int *)_q ;
+  return (lhs > rhs) - (lhs < rhs) ;
+}
+
+/*
+ * Check that the sz pointers in t are a permutation of min, min+1, ...,
+ * min+sz-1.
+ *
+ * The offsets t[k]-min are collected, sorted, and compared with 0..sz-1.
+ * Returns 1 when they match and 0 otherwise. The scratch array is
+ * allocated with malloc_check and freed before returning.
+ */
+int check_shuffle(int **t, int *min, int sz) {
+  int *offsets = malloc_check(sizeof(*offsets)*sz) ;
+  for (int i = 0 ; i < sz ; i++) {
+    offsets[i] = (int)(t[i] - min) ;
+  }
+  qsort(offsets,sz,sizeof(offsets[0]),int_cmp) ;
+  int is_perm = 1 ;
+  for (int i = 0 ; is_perm && i < sz ; i++) {
+    if (offsets[i] != i) is_perm = 0 ;
+  }
+  free(offsets) ;
+  return is_perm ;
+}
+
+/****************/
+/* Time counter */
+/****************/
+
+#include <sys/time.h>
+#include <time.h>
+
+/*
+ * Current time of day in microseconds (seconds * 1000000 + microseconds,
+ * from gettimeofday). A gettimeofday failure is reported through
+ * errexit("gettimeofday") with errno (the label used to be misspelled).
+ */
+tsc_t timeofday(void) {
+  struct timeval tv ;
+  if (gettimeofday(&tv,NULL)) errexit("gettimeofday",errno) ;
+  return tv.tv_sec * ((tsc_t)1000000) + tv.tv_usec ;
+}
+
+/* Quotient t1/t2 computed in double precision. */
+double tsc_ratio(tsc_t t1, tsc_t t2) {
+  const double numerator = (double)t1 ;
+  const double denominator = (double)t2 ;
+  return numerator / denominator ;
+}
+
+
+/* t divided by one million, as a double (microseconds to seconds when t comes from timeofday). */
+double tsc_millions(tsc_t t) {
+  const double one_million = 1000000.0 ;
+  return t / one_million ;
+}
+
+/*******************/
+/* String handling */
+/*******************/
+
+/*
+ * Linear search for string s in the sz-entry table t.
+ * Returns the index of the first entry equal to s (strcmp), or -1 when
+ * there is none.
+ */
+int find_string(char *t[], int sz, char *s) {
+  int k = 0 ;
+  while (k < sz) {
+    if (!strcmp(t[k],s)) return k ;
+    k++ ;
+  }
+  return -1 ;
+}
diff --git a/tests/tcg/mttcg/aarch64/utils.h b/tests/tcg/mttcg/aarch64/utils.h
new file mode 100644
index 0000000..99e756e
--- /dev/null
+++ b/tests/tcg/mttcg/aarch64/utils.h
@@ -0,0 +1,275 @@ 
+/****************************************************************************/
+/*                           the diy toolsuite                              */
+/*                                                                          */
+/* Jade Alglave, University College London, UK.                             */
+/* Luc Maranget, INRIA Paris-Rocquencourt, France.                          */
+/*                                                                          */
+/* Copyright 2015-present Institut National de Recherche en Informatique et */
+/* en Automatique and the authors. All rights reserved.                     */
+/*                                                                          */
+/* This software is governed by the CeCILL-B license under French law and   */
+/* abiding by the rules of distribution of free software. You can use,      */
+/* modify and/ or redistribute the software under the terms of the CeCILL-B */
+/* license as circulated by CEA, CNRS and INRIA at the following URL        */
+/* "http://www.cecill.info". We also give a copy in LICENSE.txt.            */
+/****************************************************************************/
+#ifndef _UTILS_H
+#define _UTILS_H 1
+
+#include <stdio.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include "litmus_rand.h"
+
+
+/********/
+/* Misc */
+/********/
+
+/* Select the stream used for error output -- presumably the one
+   log_error() writes to; confirm in utils.c. */
+void seterrlog(FILE *chan) ;
+
+/* Print an error message; callers pass a printf-style format string
+   followed by its arguments. */
+int log_error(const char *fmt,...) ;
+
+/* Report the fatal error msg. NOTE(review): callers do not handle a
+   return, so this presumably terminates the program -- confirm. */
+void fatal(char *msg) ;
+/* Report the failure of the operation named msg; e is the error code
+   (errno, or the value returned by a pthread_* function). */
+void errexit(char *msg,int e) ;
+
+/* Allocate sz bytes. Callers use the result without testing it for NULL,
+   so failure is presumably handled inside -- confirm. */
+void *malloc_check(size_t sz) ;
+
+/* Presumably the larger of n and m -- confirm in utils.c. */
+int max(int n,int m) ;
+
+/* Presumably prints the n ints found at p to fp -- confirm in utils.c. */
+void pp_ints (FILE *fp,int *p,int n) ;
+
+/* NOTE(review): alignment helpers; the exact rounding rule is defined in
+   utils.c and is not visible from this header. */
+void *do_align(void *p, size_t sz) ;
+
+void *do_noalign(void *p, size_t sz) ;
+
+/* Presumably copies the file at path to out, with msg as a heading --
+   confirm in utils.c. */
+void cat_file(char *path,char *msg,FILE *out) ;
+
+/***********/
+/* CPU set */
+/***********/
+
+/* Lets other code test whether cpus_t has been declared */
+#define CPUS_DEFINED 1
+/* A list of CPU numbers, as produced for instance by the -p option */
+typedef struct {
+  int sz ;     /* number of entries in cpu[] */
+  int *cpu ;   /* the CPU numbers, cpu[0] .. cpu[sz-1] */
+} cpus_t ;
+
+/* Create a CPU list with room for sz entries (callers then fill cpu[]) */
+cpus_t *cpus_create(int sz) ;
+/* Presumably as cpus_create, with the entries copied from t -- confirm */
+cpus_t *cpus_create_init(int sz, int t[]) ;
+void cpus_free(cpus_t *p) ;
+/* Print the list to fp (the exact format is defined in utils.c) */
+void cpus_dump(FILE *fp, cpus_t *p) ;
+void cpus_dump_test(FILE *fp, int *p, int sz, cpus_t *cm,int nprocs) ;
+
+/* Presumably the greatest common divisor of a and b -- confirm */
+int gcd(int a, int b) ;
+
+/* NOTE(review): core map builders; what they return is defined in utils.c
+   and is not visible from this header. */
+cpus_t *coremap_seq(int navail, int nways) ;
+cpus_t *coremap_end(int navail, int nways) ;
+
+/* NOTE(review): computes a custom thread-to-CPU allocation into r; see
+   utils.c for the meaning of color and diff. */
+void custom_affinity
+(st_t *st,cpus_t *cm,int **color,int *diff,cpus_t *aff_cpus,int n_exe, int *r) ;
+
+/*************/
+/* Int array */
+/*************/
+
+/* A sized array of ints (used for the per-proc timebase delays, -tb/-ta) */
+typedef struct {
+  int sz ;   /* number of entries in t[] */
+  int *t ;   /* the values, t[0] .. t[sz-1] */
+} ints_t ;
+
+/* Print the array to fp (the exact format is defined in utils.c) */
+void ints_dump(FILE *fp, ints_t *p) ;
+
+/* Prefetch directives */
+/* What to do with a variable before a test run. On the command line 'F'
+   selects flush, 'T' touch and 'W' touch_store; any other letter (the usage
+   text lists 'I') selects none. */
+typedef enum {none, flush, touch, touch_store} prfdir_t ;
+
+/* Prefetch directive of one variable */
+typedef struct {
+  char *name ;      /* variable name, matched with strcmp */
+  prfdir_t dir ;    /* directive for that variable */
+} prfone_t ;
+
+/* Prefetch directives of one proc (test thread) */
+typedef struct {
+  int nvars ;       /* number of entries in t[] */
+  prfone_t *t ;     /* one entry per variable */
+} prfproc_t ;
+
+/* Prefetch directives of the whole test */
+typedef struct {
+  int nthreads ;    /* number of entries in t[] */
+  prfproc_t *t ;    /* one entry per proc, indexed by proc number */
+} prfdirs_t ;
+
+/* Print the directives to fp (the exact format is defined in utils.c) */
+void prefetch_dump(FILE *fp, prfdirs_t *p) ;
+/* Parse a comma separated list of proc:name=X items into r.
+   Returns 1 on success and 0 on error; p is modified temporarily while
+   parsing, so it must be writable. */
+int parse_prefetch(char *p, prfdirs_t *r) ;
+
+/************************/
+/* Command line options */
+/************************/
+/* Affinity control mode. On the command line: -i selects aff_incr, +ra
+   aff_random, +ca aff_custom, +sa aff_scan and +ta aff_topo (the last three
+   only when enabled); aff_none in the defaults disables the affinity
+   options altogether. */
+typedef enum
+  { aff_none, aff_incr, aff_random, aff_custom,
+    aff_scan, aff_topo} aff_mode_t ;
+
+/* Test settings. parse_cmd() receives two of these: the defaults, which also
+   decide which options are available, and the record the options update.
+   The option names below are the ones handled by parse_cmd(). */
+typedef struct {
+  int verbose ;              /* -q resets to 0, each -v adds one */
+  /* Test parameters */
+  int max_run ;              /* -r, scaled by -fr and -f */
+  int size_of_test ;         /* -s, scaled by -fs and -f */
+  int stride ;               /* -st, at least 1; option exists if default > 0 */
+  int avail ;                /* -a */
+  int n_exe ;                /* -n, at least 1 */
+  int sync_n ;               /* -k, at least 0; option exists if default > 0 */
+  /* Affinity */
+  aff_mode_t aff_mode ;
+  int aff_custom_enabled ;   /* when set, +ca selects aff_custom */
+  int aff_scan_enabled ;     /* when set, +sa selects aff_scan and +ta aff_topo */
+  int aff_incr ;             /* -i, at least 0 */
+  cpus_t *aff_cpus ;         /* -p */
+  char *aff_topo ;           /* argument of +ta, points into argv */
+  /* indirect mode */
+  int shuffle ;              /* +rm sets 1, -rm sets 0; options exist if default >= 0 */
+  /* loop test */
+  int max_loop ;             /* -l, at least 1; option exists if default > 0 */
+  /* time base delays */
+  ints_t * delta_tb ;        /* -tb and -ta; options exist if default is non-NULL */
+  /* prefetch control */
+  prfdirs_t *prefetch ;      /* -prf and -pra; options exist if default is non-NULL */
+  int static_prefetch ;      /* -prs, at least 0; option exists if default >= 0 */
+  /* show time of synchronisation */
+  int verbose_barrier ;      /* +vb adds one, -vb resets to 0 */
+  /* Stop as soon as condition is settled */
+  int speedcheck ;           /* +sc sets 1, -sc sets 0 */
+  /* Enforce fixed launch order (i.e. cancel change launch) */
+  int fix ;                  /* +fix sets 1; option exists if default is 0 */
+  /* Dump prelude to test output */
+  int prelude ;              /* -vp resets to 0; option exists if default > 0 */
+} cmd_t ;
+
+/* Parse the command line (argc/argv as received by main) into *p. def holds
+   the default settings: it decides which options are recognised and is what
+   the usage message is printed from when the command line is rejected. */
+void parse_cmd(int argc, char **argv, cmd_t *def, cmd_t *p) ;
+
+
+/********************/
+/* Thread utilities */
+/********************/
+
+/* Mutex */
+
+/* A mutex is a plain pthread mutex */
+typedef pthread_mutex_t pm_t ;
+
+/* Allocate and initialise a mutex (default attributes) */
+pm_t *pm_create(void) ;
+/* Release a mutex obtained from pm_create */
+void pm_free(pm_t *p) ;
+/* Lock/unlock; a pthread error is reported through errexit */
+void pm_lock(pm_t *m) ;
+void pm_unlock(pm_t *m) ;
+
+/* Condition variable */
+
+/* A condition variable bundled with the mutex that protects it */
+typedef struct {
+  pm_t *c_mutex ;            /* mutex passed to pthread_cond_wait */
+  pthread_cond_t *c_cond ;   /* heap allocated condition variable */
+} pc_t ;
+
+/* Allocate and initialise both the mutex and the condition variable */
+pc_t *pc_create(void) ;
+/* Release a condition obtained from pc_create */
+void pc_free(pc_t *p) ;
+/* Wait on the condition, using its own mutex */
+void pc_wait(pc_t *p) ;
+/* Wake all the threads waiting on the condition */
+void pc_broadcast (pc_t *p) ;
+
+/* Barrier */
+
+/* Avoid pthread supplied barrier as they are not available in old versions */
+
+/* Reusable barrier for a fixed number of threads */
+typedef struct {
+  volatile unsigned int count ;   /* participants still to arrive in this round */
+  volatile int turn ;             /* flipped by the last arrival of each round */
+  pc_t *cond ;                    /* lock and condition the waiters sleep on */
+  unsigned int nprocs ;           /* number of participants */
+} pb_t ;
+
+
+/* Create a barrier for nprocs participants */
+pb_t *pb_create(int nprocs) ;
+void pb_free(pb_t *p) ;
+/* Block until all nprocs participants have called pb_wait */
+void pb_wait(pb_t *p) ;
+
+
+/* Or flag */
+
+/* Barrier that also computes the logical 'or' of the participants' inputs */
+typedef struct {
+  pc_t *cond ;           /* lock and condition the waiters sleep on */
+  int nprocs ;           /* number of participants */
+  int count ;            /* participants still to arrive in this round */
+  volatile int val ;     /* 'or' of the values contributed so far */
+  volatile int turn ;    /* flipped by the last arrival of each round */
+} po_t ;
+
+/* Create a flag for nprocs participants, with value 0 */
+po_t *po_create(int nprocs) ;
+void po_free(po_t *p) ;
+/* Initialize flag, must be called by all participants */
+void po_reinit(po_t *p) ;
+/* Return the 'or' of the v arguments of all participants */
+int po_wait(po_t *p, int v) ;
+
+/* One place buffer */
+
+/* Holds at most one pointer, handed from one thread to another */
+typedef struct {
+  pc_t *cond ;            /* lock and condition the reader sleeps on */
+  int volatile some ;     /* 1 when val holds a value, 0 when empty */
+  void * volatile val ;   /* the stored value, NULL when empty */
+} op_t ;
+
+/* Create an empty buffer */
+op_t *op_create(void) ;
+void op_free(op_t *p) ;
+/* Store v; the buffer must be empty, otherwise fatal("op_set") is called */
+void op_set(op_t *p, void *v) ;
+/* Wait until a value is present, take it out and return it */
+void *op_get(op_t *p) ;
+
+/* Thread launch and join */
+
+/* Type of thread bodies, as expected by pthread_create */
+typedef void* f_t(void *);
+
+/* Start a thread running f(a); th receives its identifier */
+void launch(pthread_t *th, f_t *f, void *a) ;
+
+/* Wait for the thread *th to finish and return its result */
+void *join(pthread_t *th) ;
+
+/* Detached launch and join */
+
+/* Run f(a) in a thread that detaches itself; the returned buffer will
+   receive the result */
+op_t *launch_detached(f_t *f,void *a) ;
+/* Wait for the result of a launch_detached thread, free the buffer p and
+   return the result */
+void *join_detached(op_t *p) ;
+
+/* Thread cache */
+
+/* Argument of start_thread */
+typedef struct {
+  int max_run ;            /* maximal number of functions the thread runs */
+  op_t *op_arg,*op_ret ;   /* op_arg: functions to run come in; op_ret: results go out */
+  void *arg ;              /* argument passed to every function */
+} sarg_t ;
+
+/* Thread body: repeatedly takes a function from op_arg, applies it to arg
+   and posts the result to op_ret; stops after max_run rounds or when the
+   function received is NULL. */
+f_t start_thread ;
+
+/*****************/
+/* Random things */
+/*****************/
+
+/* permutations */
+
+/* st is the random generator state handed to rand_k. NOTE(review): utils.c
+   defines these functions with 'unsigned *st', so st_t (from litmus_rand.h)
+   is presumably unsigned -- confirm. */
+/* Randomly choose the first 'used' entries of the sz-entry array t */
+void perm_prefix_ints(st_t *st,int t[], int used, int sz) ;
+/* Shuffle the sz entries of t in place */
+void perm_ints(st_t *st,int t[], int sz) ;
+void perm_funs(st_t *st,f_t *t[], int sz) ;
+void perm_threads(st_t *st,pthread_t t[], int sz) ;
+void perm_ops(st_t *st,op_t *t[], int sz) ;
+
+/* check permutation */
+/* Returns 1 when the sz pointers in t are exactly min, min+1, ...,
+   min+sz-1 in some order, and 0 otherwise */
+int check_shuffle(int **t, int *min, int sz) ;
+
+/*********************/
+/* Real time counter */
+/*********************/
+
+/* Time stamps are unsigned 64-bit-or-wider counts; PTSC is the matching
+   printf conversion */
+typedef unsigned long long tsc_t ;
+#define PTSC "%llu"
+
+/* Result in micro-seconds */
+tsc_t timeofday(void) ;
+/* t1/t2 as a double */
+double tsc_ratio(tsc_t t1, tsc_t t2) ;
+/* t divided by one million, as a double */
+double tsc_millions(tsc_t t) ;
+
+/* String utilities */
+/* Index of the first of the sz entries of t that equals s, or -1 when
+   there is none */
+int find_string(char *t[],int sz,char *s) ;
+
+#endif