Message ID | 20160618040343.19517-2-bobby.prani@gmail.com (mailing list archive)
State      | New, archived
On 18/06/16 07:03, Pranith Kumar wrote:
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index db6a062..36feca9 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -408,6 +408,20 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
>  #define TCG_CALL_DUMMY_TCGV MAKE_TCGV_I32(-1)
>  #define TCG_CALL_DUMMY_ARG ((TCGArg)(-1))
>
> +typedef enum {
> +    TCG_MO_LD_LD = 1,
> +    TCG_MO_ST_LD = 2,
> +    TCG_MO_LD_ST = 4,
> +    TCG_MO_ST_ST = 8,
> +    TCG_MO_ALL = 0xF, // OR of all above

So TCG_MO_ALL specifies a so-called "full" memory barrier?

> +} TCGOrder;
> +
> +typedef enum {
> +    TCG_BAR_ACQ = 32,
> +    TCG_BAR_REL = 64,

I'm convinced that the only practical way to represent a standalone acquire
memory barrier is to order all previous loads with all subsequent loads and
stores. Similarly, a standalone release memory barrier would order all
previous loads and stores with all subsequent stores. [1]

On the other hand, acquire or release semantics associated with a memory
operation itself can be directly mapped into e.g. AArch64's Load-Acquire
(LDAR) and Store-Release (STLR) instructions. A standalone barrier adjacent
to a memory operation shouldn't be mapped this way because it should provide
stricter guarantees than e.g. the AArch64 instructions mentioned above.

Therefore, I advocate for a clear distinction between standalone memory
barriers and the implicit memory ordering semantics associated with memory
operations themselves.

[1] http://preshing.com/20130922/acquire-and-release-fences/

> +    TCG_BAR_SC = 128,

How's that different from TCG_MO_ALL?

> +} TCGBar;
> +
>  /* Conditions.  Note that these are laid out for easy manipulation by
>     the functions below:
>       bit 0 is used for inverting;

Kind regards,
Sergey
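A minimal C11 sketch of the distinction being drawn here (the function and
variable names are invented for illustration; this is not code from the
series): a load-acquire only orders that particular load against the accesses
that follow it, and can map directly to AArch64 LDAR, while a standalone
acquire fence orders all earlier loads against all later loads and stores and
typically needs a real barrier instruction such as dmb ishld.

#include <stdatomic.h>

extern _Atomic int flag;
extern int data;

/* Load-acquire: only this load is ordered against later accesses. */
int reader_load_acquire(void)
{
    if (atomic_load_explicit(&flag, memory_order_acquire)) {
        return data;    /* cannot be hoisted above the acquire load */
    }
    return 0;
}

/* Standalone acquire fence: all earlier loads are ordered against all
 * later loads and stores, a strictly stronger guarantee. */
int reader_acquire_fence(void)
{
    int f = atomic_load_explicit(&flag, memory_order_relaxed);
    atomic_thread_fence(memory_order_acquire);
    if (f) {
        return data;
    }
    return 0;
}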
On 18/06/2016 06:03, Pranith Kumar wrote:
> +typedef enum {
> +    TCG_MO_LD_LD = 1,
> +    TCG_MO_ST_LD = 2,
> +    TCG_MO_LD_ST = 4,
> +    TCG_MO_ST_ST = 8,

I like the idea of making this a bitmask. However, most of the code you
wrote for the backends looks at these as an enum. For example,

+static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
+{
+    switch (a0 & TCG_MO_ALL) {
+    case TCG_MO_LD_LD:
+        tcg_out32(s, DMB_ISH | DMB_LD);
+        break;
+    case TCG_MO_ST_ST:
+        tcg_out32(s, DMB_ISH | DMB_ST);
+        break;
+    default:
+        tcg_out32(s, DMB_ISH | DMB_LD | DMB_ST);
+        break;
+    }
+}

should rather be

    if (a0 & (ST_LD|LD_ST)) {
        output dmb ish
        return
    }
    if (a0 & LD_LD) {
        output dmb ishld
    }
    if (a0 & LD_ST) {
        output dmb ishst
    }

Paolo

> +    TCG_MO_ALL = 0xF, // OR of all above
> +} TCGOrder;
> +
> +typedef enum {
> +    TCG_BAR_ACQ = 32,
> +    TCG_BAR_REL = 64,
> +    TCG_BAR_SC = 128,
> +} TCGBar;
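A sketch (not from the patch series) of how the AArch64 backend's
tcg_out_mb() could honour the argument as a bitmask along the lines Paolo
suggests, reusing the DMB_ISH/DMB_LD/DMB_ST constants from the quoted
backend code. Note that dmb ishld orders earlier loads against later loads
and stores, while dmb ishst only orders stores against stores, so any
requirement involving store-to-load ordering still needs a full dmb ish.

static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
{
    TCGArg mo = a0 & TCG_MO_ALL;

    if (mo & TCG_MO_ST_LD) {
        /* Only a full barrier orders an earlier store against a later load. */
        tcg_out32(s, DMB_ISH | DMB_LD | DMB_ST);
    } else if (mo == TCG_MO_ST_ST) {
        /* dmb ishst: earlier stores vs. later stores only. */
        tcg_out32(s, DMB_ISH | DMB_ST);
    } else if (mo != 0) {
        /* dmb ishld covers LD_LD, LD_ST and their combination; if
         * store-store ordering is also requested, fall back to a full
         * dmb ish rather than emitting two barriers. */
        tcg_out32(s, (mo & TCG_MO_ST_ST) ? DMB_ISH | DMB_LD | DMB_ST
                                         : DMB_ISH | DMB_LD);
    }
}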
Hi Sergey,

On Mon, Jun 20, 2016 at 5:21 PM, Sergey Fedorov <serge.fdrv@gmail.com> wrote:
> On 18/06/16 07:03, Pranith Kumar wrote:
>> diff --git a/tcg/tcg.h b/tcg/tcg.h
>> index db6a062..36feca9 100644
>> --- a/tcg/tcg.h
>> +++ b/tcg/tcg.h
>> @@ -408,6 +408,20 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
>>  #define TCG_CALL_DUMMY_TCGV MAKE_TCGV_I32(-1)
>>  #define TCG_CALL_DUMMY_ARG ((TCGArg)(-1))
>>
>> +typedef enum {
>> +    TCG_MO_LD_LD = 1,
>> +    TCG_MO_ST_LD = 2,
>> +    TCG_MO_LD_ST = 4,
>> +    TCG_MO_ST_ST = 8,
>> +    TCG_MO_ALL = 0xF, // OR of all above
>
> So TCG_MO_ALL specifies a so-called "full" memory barrier?

This enum just specifies which loads and stores need to be ordered.

TCG_MO_ALL specifies that we need to order both previous loads and
stores with later loads and stores. To get a full memory barrier you
will need to pair it with TCG_BAR_SC:

    TCG_MO_ALL | TCG_BAR_SC

>
>> +} TCGOrder;
>> +
>> +typedef enum {
>> +    TCG_BAR_ACQ = 32,
>> +    TCG_BAR_REL = 64,
>
> I'm convinced that the only practical way to represent a standalone
> acquire memory barrier is to order all previous loads with all
> subsequent loads and stores. Similarly, a standalone release memory
> barrier would order all previous loads and stores with all subsequent
> stores. [1]

Yes, here acquire would be:

    (TCG_MO_LD_ST | TCG_MO_LD_LD) | TCG_BAR_ACQ

and release would be:

    (TCG_MO_ST_ST | TCG_MO_LD_ST) | TCG_BAR_REL

>
> On the other hand, acquire or release semantics associated with a memory
> operation itself can be directly mapped into e.g. AArch64's Load-Acquire
> (LDAR) and Store-Release (STLR) instructions. A standalone barrier
> adjacent to a memory operation shouldn't be mapped this way because it
> should provide stricter guarantees than e.g. the AArch64 instructions
> mentioned above.

You are right. That is why the load-acquire operation generates the
stronger barrier:

    TCG_MO_ALL | TCG_BAR_ACQ

and not the acquire barrier above. Similarly for store-release.

>
> Therefore, I advocate for a clear distinction between standalone memory
> barriers and the implicit memory ordering semantics associated with memory
> operations themselves.

Any suggestions on how to make the distinction clearer? I will add a
detailed comment like the above, but please let me know if you have
anything else in mind.

>
> [1] http://preshing.com/20130922/acquire-and-release-fences/
>
>> +    TCG_BAR_SC = 128,
>
> How's that different from TCG_MO_ALL?

TCG_BAR_* tells us what ordering is enforced. TCG_MO_* tells us on what
operations the ordering is to be enforced.

Thanks,
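Spelled out as code, the combinations above could be wrapped in helpers like
the following. The helper names are invented for illustration and are not
part of the patch; they simply name the flag combinations described in the
message above.

/* Illustrative wrappers only, not part of the series. */
static inline void tcg_gen_mb_full(void)
{
    tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);    /* full memory barrier */
}

static inline void tcg_gen_mb_acquire(void)
{
    /* earlier loads vs. later loads and stores */
    tcg_gen_mb(TCG_MO_LD_LD | TCG_MO_LD_ST | TCG_BAR_ACQ);
}

static inline void tcg_gen_mb_release(void)
{
    /* earlier loads and stores vs. later stores */
    tcg_gen_mb(TCG_MO_LD_ST | TCG_MO_ST_ST | TCG_BAR_REL);
}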
Pranith Kumar <bobby.prani@gmail.com> writes: > Hi Sergey, > > On Mon, Jun 20, 2016 at 5:21 PM, Sergey Fedorov <serge.fdrv@gmail.com> wrote: >> On 18/06/16 07:03, Pranith Kumar wrote: >>> diff --git a/tcg/tcg.h b/tcg/tcg.h >>> index db6a062..36feca9 100644 >>> --- a/tcg/tcg.h >>> +++ b/tcg/tcg.h >>> @@ -408,6 +408,20 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t) >>> #define TCG_CALL_DUMMY_TCGV MAKE_TCGV_I32(-1) >>> #define TCG_CALL_DUMMY_ARG ((TCGArg)(-1)) >>> >>> +typedef enum { >>> + TCG_MO_LD_LD = 1, >>> + TCG_MO_ST_LD = 2, >>> + TCG_MO_LD_ST = 4, >>> + TCG_MO_ST_ST = 8, >>> + TCG_MO_ALL = 0xF, // OR of all above >> >> So TCG_MO_ALL specifies a so called "full" memory barrier? > > This enum just specifies what loads and stores need to be ordered. > > TCG_MO_ALL specifies that we need to order both previous loads and > stores with later loads and stores. To get a full memory barrier you > will need to pair it with BAR_SC: > > TCG_MO_ALL | TCG_BAR_SC > >> >>> +} TCGOrder; >>> + >>> +typedef enum { >>> + TCG_BAR_ACQ = 32, >>> + TCG_BAR_REL = 64, >> >> I'm convinced that the only practical way to represent a standalone >> acquire memory barrier is to order all previous loads with all >> subsequent loads and stores. Similarly, a standalone release memory >> barrier would order all previous loads and stores with all subsequent >> stores. [1] > > Yes, here acquire would be: > > (TCG_MO_LD_ST | TCG_MO_LD_LD) | TCG_BAR_ACQ > > and release would be: > > (TCG_MO_ST_ST | TCG_MO_LD_ST) | TCG_BAR_REL > >> >> On the other hand, acquire or release semantic associated with a memory >> operation itself can be directly mapped into e.g. AArch64's Load-Acquire >> (LDAR) and Store-Release (STLR) instructions. A standalone barrier >> adjacent to a memory operation shouldn't be mapped this way because it >> should provide more strict guarantees than e.g. AArch64 instructions >> mentioned above. > > You are right. That is why the load-acquire operation generates the > stronger barrier: > > TCG_MO_ALL | TCG_BAR_ACQ and not the acquire barrier above. Similarly > for store-release. > >> >> Therefore, I advocate for clear distinction between standalone memory >> barriers and implicit memory ordering semantics associated with memory >> operations themselves. > > Any suggestions on how to make the distinction clearer? I will add a > detailed comment like the above but please let me know if you have > anything in mind. > >> >> [1] http://preshing.com/20130922/acquire-and-release-fences/ >> >>> + TCG_BAR_SC = 128, >> >> How's that different from TCG_MO_ALL? > > TCG_BAR_* tells us what ordering is enforced. TCG_MO_* tells what on > what operations the ordering is to be enforced. This would be worthwhile in the comments. I'm confused by the fact we have two sets of enums that are going to be merged when building TCGOp parameters. -- Alex Bennée
Pranith Kumar <bobby.prani@gmail.com> writes: > This commit introduces the TCGOpcode for memory barrier instruction. > > This opcode takes an argument which is the type of memory barrier > which should be generated. > > Signed-off-by: Pranith Kumar <bobby.prani@gmail.com> > Signed-off-by: Richard Henderson <rth@twiddle.net> > --- > tcg/README | 17 +++++++++++++++++ > tcg/tcg-op.c | 11 +++++++++++ > tcg/tcg-op.h | 2 ++ > tcg/tcg-opc.h | 2 ++ > tcg/tcg.h | 14 ++++++++++++++ > 5 files changed, 46 insertions(+) > > diff --git a/tcg/README b/tcg/README > index ce8beba..1d48aa9 100644 > --- a/tcg/README > +++ b/tcg/README > @@ -402,6 +402,23 @@ double-word product T0. The later is returned in two single-word outputs. > > Similar to mulu2, except the two inputs T1 and T2 are signed. > > +********* Memory Barrier support > + > +* mb <$arg> > + > +Generate a target memory barrier instruction to ensure memory ordering as being > +enforced by a corresponding guest memory barrier instruction. The ordering > +enforced by the backend may be stricter than the ordering required by the guest. > +It cannot be weaker. This opcode takes a constant argument which is required to > +generate the appropriate barrier instruction. The backend should take care to > +emit the target barrier instruction only when necessary i.e., for SMP guests and > +when MTTCG is enabled. > + > +The guest translators should generate this opcode for all guest instructions > +which have ordering side effects. > + > +Please see docs/atomics.txt for more information on memory barriers. > + > ********* 64-bit guest on 32-bit host support > > The following opcodes are internal to TCG. Thus they are to be implemented by > diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c > index 54c0277..08f7858 100644 > --- a/tcg/tcg-op.c > +++ b/tcg/tcg-op.c > @@ -39,6 +39,8 @@ extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64); > #define TCGV_HIGH TCGV_HIGH_link_error > #endif > > +extern int smp_cpus; > + > /* Note that this is optimized for sequential allocation during translate. > Up to and including filling in the forward link immediately. We'll do > proper termination of the end of the list after we finish translation. */ > @@ -146,6 +148,15 @@ void tcg_gen_op6(TCGContext *ctx, TCGOpcode opc, TCGArg a1, TCGArg a2, > tcg_emit_op(ctx, opc, pi); > } > > +void tcg_gen_mb(TCGArg mb_type) > +{ > +#ifndef CONFIG_USER_ONLY > + if (qemu_tcg_mttcg_enabled() && smp_cpus > 1) { > + tcg_gen_op1(&tcg_ctx, INDEX_op_mb, mb_type); > + } > +#endif > +} > + This introduces a dependency on MTTCG which maybe is best handled in that patch series. I get the feeling we might be able to merge this series before MTTCG as barriers are still required for user-mode. Maybe something like this makes more sense to keep the patch series independent for now: void tcg_gen_mb(TCGArg mb_type) { #ifdef CONFIG_USER_ONLY const bool emit_barriers = true; #else /* TODO: When MTTCG is available for system mode * we will enable when qemu_tcg_mttcg_enabled() && smp_cpus > 1 */ bool emit_barriers = false; #endif if (emit_barriers) { tcg_gen_op1(&tcg_ctx, INDEX_op_mb, mb_type); } } Maybe it is time to look at the user-mode memory model testers like: http://diy.inria.fr/ Rich, What do you think? Do you think the memory barrier stuff should be kept independent so it can be merged once ready? 
> /* 32 bit ops */ > > void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) > diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h > index f217e80..41890cc 100644 > --- a/tcg/tcg-op.h > +++ b/tcg/tcg-op.h > @@ -261,6 +261,8 @@ static inline void tcg_gen_br(TCGLabel *l) > tcg_gen_op1(&tcg_ctx, INDEX_op_br, label_arg(l)); > } > > +void tcg_gen_mb(TCGArg a); > + > /* Helper calls. */ > > /* 32 bit ops */ > diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h > index 6d0410c..45528d2 100644 > --- a/tcg/tcg-opc.h > +++ b/tcg/tcg-opc.h > @@ -42,6 +42,8 @@ DEF(br, 0, 0, 1, TCG_OPF_BB_END) > # define IMPL64 TCG_OPF_64BIT > #endif > > +DEF(mb, 0, 0, 1, 0) > + > DEF(mov_i32, 1, 1, 0, TCG_OPF_NOT_PRESENT) > DEF(movi_i32, 1, 0, 1, TCG_OPF_NOT_PRESENT) > DEF(setcond_i32, 1, 2, 1, 0) > diff --git a/tcg/tcg.h b/tcg/tcg.h > index db6a062..36feca9 100644 > --- a/tcg/tcg.h > +++ b/tcg/tcg.h > @@ -408,6 +408,20 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t) > #define TCG_CALL_DUMMY_TCGV MAKE_TCGV_I32(-1) > #define TCG_CALL_DUMMY_ARG ((TCGArg)(-1)) > > +typedef enum { > + TCG_MO_LD_LD = 1, > + TCG_MO_ST_LD = 2, > + TCG_MO_LD_ST = 4, > + TCG_MO_ST_ST = 8, > + TCG_MO_ALL = 0xF, // OR of all above > +} TCGOrder; > + > +typedef enum { > + TCG_BAR_ACQ = 32, > + TCG_BAR_REL = 64, > + TCG_BAR_SC = 128, > +} TCGBar; I mentioned my confusing in another email about having separate enums here. > + > /* Conditions. Note that these are laid out for easy manipulation by > the functions below: > bit 0 is used for inverting; -- Alex Bennée
On Tue, Jun 21, 2016 at 11:09 AM, Alex Bennée <alex.bennee@linaro.org> wrote:
>>>
>>>> +    TCG_BAR_SC = 128,
>>>
>>> How's that different from TCG_MO_ALL?
>>
>> TCG_BAR_* tells us what ordering is enforced. TCG_MO_* tells us on what
>> operations the ordering is to be enforced.
>
> This would be worthwhile in the comments. I'm confused by the fact we
> have two sets of enums that are going to be merged when building TCGOp
> parameters.

OK, I will add these comments with the details.
On Tue, Jun 21, 2016 at 2:04 PM, Alex Bennée <alex.bennee@linaro.org> wrote: > > Pranith Kumar <bobby.prani@gmail.com> writes: > >> This commit introduces the TCGOpcode for memory barrier instruction. >> >> This opcode takes an argument which is the type of memory barrier >> which should be generated. >> >> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com> >> Signed-off-by: Richard Henderson <rth@twiddle.net> >> --- >> tcg/README | 17 +++++++++++++++++ >> tcg/tcg-op.c | 11 +++++++++++ >> tcg/tcg-op.h | 2 ++ >> tcg/tcg-opc.h | 2 ++ >> tcg/tcg.h | 14 ++++++++++++++ >> 5 files changed, 46 insertions(+) >> >> diff --git a/tcg/README b/tcg/README >> index ce8beba..1d48aa9 100644 >> --- a/tcg/README >> +++ b/tcg/README >> @@ -402,6 +402,23 @@ double-word product T0. The later is returned in two single-word outputs. >> >> Similar to mulu2, except the two inputs T1 and T2 are signed. >> >> +********* Memory Barrier support >> + >> +* mb <$arg> >> + >> +Generate a target memory barrier instruction to ensure memory ordering as being >> +enforced by a corresponding guest memory barrier instruction. The ordering >> +enforced by the backend may be stricter than the ordering required by the guest. >> +It cannot be weaker. This opcode takes a constant argument which is required to >> +generate the appropriate barrier instruction. The backend should take care to >> +emit the target barrier instruction only when necessary i.e., for SMP guests and >> +when MTTCG is enabled. >> + >> +The guest translators should generate this opcode for all guest instructions >> +which have ordering side effects. >> + >> +Please see docs/atomics.txt for more information on memory barriers. >> + >> ********* 64-bit guest on 32-bit host support >> >> The following opcodes are internal to TCG. Thus they are to be implemented by >> diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c >> index 54c0277..08f7858 100644 >> --- a/tcg/tcg-op.c >> +++ b/tcg/tcg-op.c >> @@ -39,6 +39,8 @@ extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64); >> #define TCGV_HIGH TCGV_HIGH_link_error >> #endif >> >> +extern int smp_cpus; >> + >> /* Note that this is optimized for sequential allocation during translate. >> Up to and including filling in the forward link immediately. We'll do >> proper termination of the end of the list after we finish translation. */ >> @@ -146,6 +148,15 @@ void tcg_gen_op6(TCGContext *ctx, TCGOpcode opc, TCGArg a1, TCGArg a2, >> tcg_emit_op(ctx, opc, pi); >> } >> >> +void tcg_gen_mb(TCGArg mb_type) >> +{ >> +#ifndef CONFIG_USER_ONLY >> + if (qemu_tcg_mttcg_enabled() && smp_cpus > 1) { >> + tcg_gen_op1(&tcg_ctx, INDEX_op_mb, mb_type); >> + } >> +#endif >> +} >> + > > This introduces a dependency on MTTCG which maybe is best handled in > that patch series. I get the feeling we might be able to merge this > series before MTTCG as barriers are still required for user-mode. Maybe > something like this makes more sense to keep the patch series > independent for now: > > void tcg_gen_mb(TCGArg mb_type) > { > #ifdef CONFIG_USER_ONLY > const bool emit_barriers = true; > #else > /* TODO: When MTTCG is available for system mode > * we will enable when qemu_tcg_mttcg_enabled() && smp_cpus > 1 > */ > bool emit_barriers = false; > #endif > if (emit_barriers) { > tcg_gen_op1(&tcg_ctx, INDEX_op_mb, mb_type); > } > } I was basing my patches on top of your mttcg base tree, since I can test these patches only with MTTCG enabled :) If you think I should rebase on mainine, I will do so with the changes you suggested above. 
> > Maybe it is time to look at the user-mode memory model testers like: > > http://diy.inria.fr/ Interesting. I will take a look. Thanks for the link. > > Rich, > > What do you think? Do you think the memory barrier stuff should be kept > independent so it can be merged once ready? > >> /* 32 bit ops */ >> >> void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) >> diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h >> index f217e80..41890cc 100644 >> --- a/tcg/tcg-op.h >> +++ b/tcg/tcg-op.h >> @@ -261,6 +261,8 @@ static inline void tcg_gen_br(TCGLabel *l) >> tcg_gen_op1(&tcg_ctx, INDEX_op_br, label_arg(l)); >> } >> >> +void tcg_gen_mb(TCGArg a); >> + >> /* Helper calls. */ >> >> /* 32 bit ops */ >> diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h >> index 6d0410c..45528d2 100644 >> --- a/tcg/tcg-opc.h >> +++ b/tcg/tcg-opc.h >> @@ -42,6 +42,8 @@ DEF(br, 0, 0, 1, TCG_OPF_BB_END) >> # define IMPL64 TCG_OPF_64BIT >> #endif >> >> +DEF(mb, 0, 0, 1, 0) >> + >> DEF(mov_i32, 1, 1, 0, TCG_OPF_NOT_PRESENT) >> DEF(movi_i32, 1, 0, 1, TCG_OPF_NOT_PRESENT) >> DEF(setcond_i32, 1, 2, 1, 0) >> diff --git a/tcg/tcg.h b/tcg/tcg.h >> index db6a062..36feca9 100644 >> --- a/tcg/tcg.h >> +++ b/tcg/tcg.h >> @@ -408,6 +408,20 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t) >> #define TCG_CALL_DUMMY_TCGV MAKE_TCGV_I32(-1) >> #define TCG_CALL_DUMMY_ARG ((TCGArg)(-1)) >> >> +typedef enum { >> + TCG_MO_LD_LD = 1, >> + TCG_MO_ST_LD = 2, >> + TCG_MO_LD_ST = 4, >> + TCG_MO_ST_ST = 8, >> + TCG_MO_ALL = 0xF, // OR of all above >> +} TCGOrder; >> + >> +typedef enum { >> + TCG_BAR_ACQ = 32, >> + TCG_BAR_REL = 64, >> + TCG_BAR_SC = 128, >> +} TCGBar; > > I mentioned my confusing in another email about having separate enums here. Noted. I will add a comment explaining the split.
Pranith Kumar <bobby.prani@gmail.com> writes: > On Tue, Jun 21, 2016 at 2:04 PM, Alex Bennée <alex.bennee@linaro.org> wrote: >> >> Pranith Kumar <bobby.prani@gmail.com> writes: >> >>> This commit introduces the TCGOpcode for memory barrier instruction. >>> >>> This opcode takes an argument which is the type of memory barrier >>> which should be generated. >>> >>> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com> >>> Signed-off-by: Richard Henderson <rth@twiddle.net> >>> --- >>> tcg/README | 17 +++++++++++++++++ >>> tcg/tcg-op.c | 11 +++++++++++ >>> tcg/tcg-op.h | 2 ++ >>> tcg/tcg-opc.h | 2 ++ >>> tcg/tcg.h | 14 ++++++++++++++ >>> 5 files changed, 46 insertions(+) >>> >>> diff --git a/tcg/README b/tcg/README >>> index ce8beba..1d48aa9 100644 >>> --- a/tcg/README >>> +++ b/tcg/README >>> @@ -402,6 +402,23 @@ double-word product T0. The later is returned in two single-word outputs. >>> >>> Similar to mulu2, except the two inputs T1 and T2 are signed. >>> >>> +********* Memory Barrier support >>> + >>> +* mb <$arg> >>> + >>> +Generate a target memory barrier instruction to ensure memory ordering as being >>> +enforced by a corresponding guest memory barrier instruction. The ordering >>> +enforced by the backend may be stricter than the ordering required by the guest. >>> +It cannot be weaker. This opcode takes a constant argument which is required to >>> +generate the appropriate barrier instruction. The backend should take care to >>> +emit the target barrier instruction only when necessary i.e., for SMP guests and >>> +when MTTCG is enabled. >>> + >>> +The guest translators should generate this opcode for all guest instructions >>> +which have ordering side effects. >>> + >>> +Please see docs/atomics.txt for more information on memory barriers. >>> + >>> ********* 64-bit guest on 32-bit host support >>> >>> The following opcodes are internal to TCG. Thus they are to be implemented by >>> diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c >>> index 54c0277..08f7858 100644 >>> --- a/tcg/tcg-op.c >>> +++ b/tcg/tcg-op.c >>> @@ -39,6 +39,8 @@ extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64); >>> #define TCGV_HIGH TCGV_HIGH_link_error >>> #endif >>> >>> +extern int smp_cpus; >>> + >>> /* Note that this is optimized for sequential allocation during translate. >>> Up to and including filling in the forward link immediately. We'll do >>> proper termination of the end of the list after we finish translation. */ >>> @@ -146,6 +148,15 @@ void tcg_gen_op6(TCGContext *ctx, TCGOpcode opc, TCGArg a1, TCGArg a2, >>> tcg_emit_op(ctx, opc, pi); >>> } >>> >>> +void tcg_gen_mb(TCGArg mb_type) >>> +{ >>> +#ifndef CONFIG_USER_ONLY >>> + if (qemu_tcg_mttcg_enabled() && smp_cpus > 1) { >>> + tcg_gen_op1(&tcg_ctx, INDEX_op_mb, mb_type); >>> + } >>> +#endif >>> +} >>> + >> >> This introduces a dependency on MTTCG which maybe is best handled in >> that patch series. I get the feeling we might be able to merge this >> series before MTTCG as barriers are still required for user-mode. 
Maybe >> something like this makes more sense to keep the patch series >> independent for now: >> >> void tcg_gen_mb(TCGArg mb_type) >> { >> #ifdef CONFIG_USER_ONLY >> const bool emit_barriers = true; >> #else >> /* TODO: When MTTCG is available for system mode >> * we will enable when qemu_tcg_mttcg_enabled() && smp_cpus > 1 >> */ >> bool emit_barriers = false; >> #endif >> if (emit_barriers) { >> tcg_gen_op1(&tcg_ctx, INDEX_op_mb, mb_type); >> } >> } > > I was basing my patches on top of your mttcg base tree, since I can > test these patches only with MTTCG enabled :) > > If you think I should rebase on mainine, I will do so with the changes > you suggested above. Well I'll be guided by what Richard thinks about the chances of this merging ahead of the main MTTCG patches. You were mentioning the trouble with testing with the kvm-unit-tests based tests which all need MTTCG system mode to trigger problems. However user mode emulation "works" now (even better since Peter's signal fixes) so it might be worth investigating if we can leverage some user-mode testing instead. Also generally keeping patch series clean and self contained stops too large a pile of patches getting pilled up waiting for the controversial bits to get reviewed and merged. > >> >> Maybe it is time to look at the user-mode memory model testers like: >> >> http://diy.inria.fr/ > > Interesting. I will take a look. Thanks for the link. > >> >> Rich, >> >> What do you think? Do you think the memory barrier stuff should be kept >> independent so it can be merged once ready? >> >>> /* 32 bit ops */ >>> >>> void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) >>> diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h >>> index f217e80..41890cc 100644 >>> --- a/tcg/tcg-op.h >>> +++ b/tcg/tcg-op.h >>> @@ -261,6 +261,8 @@ static inline void tcg_gen_br(TCGLabel *l) >>> tcg_gen_op1(&tcg_ctx, INDEX_op_br, label_arg(l)); >>> } >>> >>> +void tcg_gen_mb(TCGArg a); >>> + >>> /* Helper calls. */ >>> >>> /* 32 bit ops */ >>> diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h >>> index 6d0410c..45528d2 100644 >>> --- a/tcg/tcg-opc.h >>> +++ b/tcg/tcg-opc.h >>> @@ -42,6 +42,8 @@ DEF(br, 0, 0, 1, TCG_OPF_BB_END) >>> # define IMPL64 TCG_OPF_64BIT >>> #endif >>> >>> +DEF(mb, 0, 0, 1, 0) >>> + >>> DEF(mov_i32, 1, 1, 0, TCG_OPF_NOT_PRESENT) >>> DEF(movi_i32, 1, 0, 1, TCG_OPF_NOT_PRESENT) >>> DEF(setcond_i32, 1, 2, 1, 0) >>> diff --git a/tcg/tcg.h b/tcg/tcg.h >>> index db6a062..36feca9 100644 >>> --- a/tcg/tcg.h >>> +++ b/tcg/tcg.h >>> @@ -408,6 +408,20 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t) >>> #define TCG_CALL_DUMMY_TCGV MAKE_TCGV_I32(-1) >>> #define TCG_CALL_DUMMY_ARG ((TCGArg)(-1)) >>> >>> +typedef enum { >>> + TCG_MO_LD_LD = 1, >>> + TCG_MO_ST_LD = 2, >>> + TCG_MO_LD_ST = 4, >>> + TCG_MO_ST_ST = 8, >>> + TCG_MO_ALL = 0xF, // OR of all above >>> +} TCGOrder; >>> + >>> +typedef enum { >>> + TCG_BAR_ACQ = 32, >>> + TCG_BAR_REL = 64, >>> + TCG_BAR_SC = 128, >>> +} TCGBar; >> >> I mentioned my confusing in another email about having separate enums here. > > > Noted. I will add a comment explaining the split. -- Alex Bennée
On 06/21/2016 11:23 AM, Alex Bennée wrote:
>> If you think I should rebase on mainine, I will do so with the changes
>> you suggested above.
>
> Well I'll be guided by what Richard thinks about the chances of this
> merging ahead of the main MTTCG patches. You were mentioning the trouble
> with testing with the kvm-unit-tests based tests which all need MTTCG
> system mode to trigger problems. However user mode emulation "works" now
> (even better since Peter's signal fixes) so it might be worth
> investigating if we can leverage some user-mode testing instead.
>
> Also generally keeping patch series clean and self contained stops too
> large a pile of patches getting pilled up waiting for the controversial
> bits to get reviewed and merged.

I'm willing to merge this before mttcg.  I'm with Alex that we should
minimize the amount of code that gets blocked by that patch set.


r~
On 21/06/16 17:52, Pranith Kumar wrote: > Hi Sergey, > > On Mon, Jun 20, 2016 at 5:21 PM, Sergey Fedorov <serge.fdrv@gmail.com> wrote: >> On 18/06/16 07:03, Pranith Kumar wrote: >>> diff --git a/tcg/tcg.h b/tcg/tcg.h >>> index db6a062..36feca9 100644 >>> --- a/tcg/tcg.h >>> +++ b/tcg/tcg.h >>> @@ -408,6 +408,20 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t) >>> #define TCG_CALL_DUMMY_TCGV MAKE_TCGV_I32(-1) >>> #define TCG_CALL_DUMMY_ARG ((TCGArg)(-1)) >>> >>> +typedef enum { >>> + TCG_MO_LD_LD = 1, >>> + TCG_MO_ST_LD = 2, >>> + TCG_MO_LD_ST = 4, >>> + TCG_MO_ST_ST = 8, >>> + TCG_MO_ALL = 0xF, // OR of all above >> So TCG_MO_ALL specifies a so called "full" memory barrier? > This enum just specifies what loads and stores need to be ordered. > > TCG_MO_ALL specifies that we need to order both previous loads and > stores with later loads and stores. To get a full memory barrier you > will need to pair it with BAR_SC: > > TCG_MO_ALL | TCG_BAR_SC If we define the semantics for the flags above as it is defined for corresponding Sparc MEMBAR instruction mmask bits (which I think really makes sense), then a combination of all of these flags makes a full memory barrier which guarantees transitivity, i.e. sequential consistency. Let me just quote [Sparc v9 manual] regarding MEMBAR instruction mmask encoding: #StoreStore The effects of all stores appearing prior to the MEMBAR instruction must be visible *to all processors* before the effect of any stores following the MEMBAR. #LoadStore All loads appearing prior to the MEMBAR instruction must have been performed before the effect of any stores following the MEMBAR is visible *to any other processor*. #StoreLoad The effects of all stores appearing prior to the MEMBAR instruction must be visible *to all processors* before loads following the MEMBAR may be performed. #LoadLoad All loads appearing prior to the MEMBAR instruction must have been performed before any loads following the MEMBAR may be performed. I'm emphasising "to all processors" and "to any other processor" here because these expressions suggest transitivity, if I understand it correctly. [Sparc v9 manual] http://sparc.org/wp-content/uploads/2014/01/SPARCV9.pdf.gz > >>> +} TCGOrder; >>> + >>> +typedef enum { >>> + TCG_BAR_ACQ = 32, >>> + TCG_BAR_REL = 64, >> I'm convinced that the only practical way to represent a standalone >> acquire memory barrier is to order all previous loads with all >> subsequent loads and stores. Similarly, a standalone release memory >> barrier would order all previous loads and stores with all subsequent >> stores. [1] > Yes, here acquire would be: > > (TCG_MO_LD_ST | TCG_MO_LD_LD) | TCG_BAR_ACQ > > and release would be: > > (TCG_MO_ST_ST | TCG_MO_LD_ST) | TCG_BAR_REL Could you please explain the difference between: (TCG_MO_LD_ST | TCG_MO_LD_LD) | TCG_BAR_ACQ and (TCG_MO_LD_ST | TCG_MO_LD_LD) and TCG_BAR_ACQ or between: (TCG_MO_ST_ST | TCG_MO_LD_ST) | TCG_BAR_REL and (TCG_MO_ST_ST | TCG_MO_LD_ST) and TCG_BAR_REL ? (Please first consider the comments below and above.) > >> On the other hand, acquire or release semantic associated with a memory >> operation itself can be directly mapped into e.g. AArch64's Load-Acquire >> (LDAR) and Store-Release (STLR) instructions. A standalone barrier >> adjacent to a memory operation shouldn't be mapped this way because it >> should provide more strict guarantees than e.g. AArch64 instructions >> mentioned above. > You are right. 
That is why the load-acquire operation generates the > stronger barrier: > > TCG_MO_ALL | TCG_BAR_ACQ and not the acquire barrier above. Similarly > for store-release. I meant that e.g. Sparc TSO load could be efficiently mapped to ARMv8 or Itanium load-acquire and Sparc TSO store - to ARMv8 or Itanium store-release. But Sparc TSO load + membar #LoadLoad | #LoadStore cannot be mapped this way because ARMv8 or Itanium load-acquire semantics is weaker than the corresponding standalone barrier semantics. > >> Therefore, I advocate for clear distinction between standalone memory >> barriers and implicit memory ordering semantics associated with memory >> operations themselves. > Any suggestions on how to make the distinction clearer? I will add a > detailed comment like the above but please let me know if you have > anything in mind. My suggestion is to separate standalone memory barriers and implicit ordering requirements of loads/stores. Then a standalone guest memory barrier instruction will translate into a standalone TCG memory barrier operation which will translate into a standalone host memory barrier instruction (possibly no-op). We can take the semantic of Sparc v9 MEMBAR instruction mmask bits as the semantic of our standalone TCG memory barrier operation as described above. There seems to be no special "release" or "acquire" memory barrier as a standalone instruction in our guest/host architectures. I'm convinced that the best way to represent [C11] acquire fence is a combined memory barrier for load-load and load-store; the best way to represent [C11] release fence is a combined barrier for store-store and load-store; and the best way to represent [C11] sequential consistency fence is a combined barrier for all four flags. Orthogonally, we can attribute each guest memory load/store TCG operation with "acquire", "release" and "sequentially consistent" flags with the semantics as defined for [C11] 'memory_order' enum constants. I think we can skip "consume" semantics since it does only make sense for the ancient Alpha architecture which we don't support as QEMU host; let's just require that each load is always a consume-load. Then e.g. x86 or Sparc TSO regular load instruction will translate into TCG guest memory load operation with acquire flag set which will translate into e.g. Itanium or ARMv8 load-acquire instruction; and e.g. x86 or Sparc TSO regular store instruction will translate into TCG guest memory store operation with release flag set which will translate into e.g. Itanium or ARMv8 store-release instruction. That's supposed to be the most efficient way to support a strongly-ordered guest on a weakly-ordered host. Even if we disable MTTCG for such guest-host combinations, we may still like to support user-mode emulation for them which can happen to be multi-threaded. The key difference between: (1) a regular load followed by a combined memory barrier for load-load and load-store, and load-acquire; (2) a combined barrier for store-store and load-store followed by a regular store, and store-release - is that load-acquire/store-release is always associated with a particular address loaded/stored whereas a standalone barrier is supposed to order *all* corresponding loads/stores across the barrier. Thus a standalone barrier are stronger then a load-acquire/store-release and cannot be an efficient intermediate representation for this semantics. 
See also this email: http://thread.gmane.org/gmane.comp.emulators.qemu/420223/focus=421309 I would suggest to deal only with explicit standalone memory barrier instruction translation in this series. After that, we could start with the second part of implicit memory ordering requirements which is more complex topic anyway. [C11] http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1570.pdf > >> [1] http://preshing.com/20130922/acquire-and-release-fences/ >> >>> + TCG_BAR_SC = 128, >> How's that different from TCG_MO_ALL? > TCG_BAR_* tells us what ordering is enforced. TCG_MO_* tells what on > what operations the ordering is to be enforced. It sound like unnecessary duplication of the same information, see the explanation above. Kind regards, Sergey
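As a sketch of the standalone-barrier half of this proposal, a Sparc front
end might map the MEMBAR mmask bits straight onto the TCG_MO_* flags. The
helper name and the choice of TCG_BAR_SC below are assumptions for
illustration, not code from the series; the mmask encoding is #LoadLoad = 1,
#StoreLoad = 2, #LoadStore = 4, #StoreStore = 8.

/* Hypothetical helper: translate a Sparc v9 MEMBAR mmask into a TCG mb op. */
static void gen_membar(uint32_t mmask)
{
    TCGArg mo = 0;

    if (mmask & 1) {            /* #LoadLoad */
        mo |= TCG_MO_LD_LD;
    }
    if (mmask & 2) {            /* #StoreLoad */
        mo |= TCG_MO_ST_LD;
    }
    if (mmask & 4) {            /* #LoadStore */
        mo |= TCG_MO_LD_ST;
    }
    if (mmask & 8) {            /* #StoreStore */
        mo |= TCG_MO_ST_ST;
    }
    if (mo) {
        tcg_gen_mb(mo | TCG_BAR_SC);
    }
}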
diff --git a/tcg/README b/tcg/README
index ce8beba..1d48aa9 100644
--- a/tcg/README
+++ b/tcg/README
@@ -402,6 +402,23 @@ double-word product T0. The later is returned in two single-word outputs.
 
 Similar to mulu2, except the two inputs T1 and T2 are signed.
 
+********* Memory Barrier support
+
+* mb <$arg>
+
+Generate a target memory barrier instruction to ensure memory ordering as being
+enforced by a corresponding guest memory barrier instruction. The ordering
+enforced by the backend may be stricter than the ordering required by the guest.
+It cannot be weaker. This opcode takes a constant argument which is required to
+generate the appropriate barrier instruction. The backend should take care to
+emit the target barrier instruction only when necessary i.e., for SMP guests and
+when MTTCG is enabled.
+
+The guest translators should generate this opcode for all guest instructions
+which have ordering side effects.
+
+Please see docs/atomics.txt for more information on memory barriers.
+
 ********* 64-bit guest on 32-bit host support
 
 The following opcodes are internal to TCG. Thus they are to be implemented by
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 54c0277..08f7858 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -39,6 +39,8 @@ extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64);
 #define TCGV_HIGH TCGV_HIGH_link_error
 #endif
 
+extern int smp_cpus;
+
 /* Note that this is optimized for sequential allocation during translate.
    Up to and including filling in the forward link immediately.  We'll do
    proper termination of the end of the list after we finish translation. */
@@ -146,6 +148,15 @@ void tcg_gen_op6(TCGContext *ctx, TCGOpcode opc, TCGArg a1, TCGArg a2,
     tcg_emit_op(ctx, opc, pi);
 }
 
+void tcg_gen_mb(TCGArg mb_type)
+{
+#ifndef CONFIG_USER_ONLY
+    if (qemu_tcg_mttcg_enabled() && smp_cpus > 1) {
+        tcg_gen_op1(&tcg_ctx, INDEX_op_mb, mb_type);
+    }
+#endif
+}
+
 /* 32 bit ops */
 
 void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index f217e80..41890cc 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -261,6 +261,8 @@ static inline void tcg_gen_br(TCGLabel *l)
     tcg_gen_op1(&tcg_ctx, INDEX_op_br, label_arg(l));
 }
 
+void tcg_gen_mb(TCGArg a);
+
 /* Helper calls. */
 
 /* 32 bit ops */
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 6d0410c..45528d2 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -42,6 +42,8 @@ DEF(br, 0, 0, 1, TCG_OPF_BB_END)
 # define IMPL64 TCG_OPF_64BIT
 #endif
 
+DEF(mb, 0, 0, 1, 0)
+
 DEF(mov_i32, 1, 1, 0, TCG_OPF_NOT_PRESENT)
 DEF(movi_i32, 1, 0, 1, TCG_OPF_NOT_PRESENT)
 DEF(setcond_i32, 1, 2, 1, 0)
diff --git a/tcg/tcg.h b/tcg/tcg.h
index db6a062..36feca9 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -408,6 +408,20 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
 #define TCG_CALL_DUMMY_TCGV MAKE_TCGV_I32(-1)
 #define TCG_CALL_DUMMY_ARG ((TCGArg)(-1))
 
+typedef enum {
+    TCG_MO_LD_LD = 1,
+    TCG_MO_ST_LD = 2,
+    TCG_MO_LD_ST = 4,
+    TCG_MO_ST_ST = 8,
+    TCG_MO_ALL = 0xF, // OR of all above
+} TCGOrder;
+
+typedef enum {
+    TCG_BAR_ACQ = 32,
+    TCG_BAR_REL = 64,
+    TCG_BAR_SC = 128,
+} TCGBar;
+
 /* Conditions.  Note that these are laid out for easy manipulation by
    the functions below:
      bit 0 is used for inverting;