Message ID | 1-v5-cd1be8dd9c71+3fa-smmuv3_newapi_p1_jgg@nvidia.com (mailing list archive)
---|---
State | New, archived
Series | [v5,01/17] iommu/arm-smmu-v3: Make STE programming independent of the callers
Hi Jason, On Tue, Feb 06, 2024 at 11:12:38AM -0400, Jason Gunthorpe wrote: > As the comment in arm_smmu_write_strtab_ent() explains, this routine has > been limited to only work correctly in certain scenarios that the caller > must ensure. Generally the caller must put the STE into ABORT or BYPASS > before attempting to program it to something else. This is looking pretty good now, but I have a few comments inline. > drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 330 ++++++++++++++++---- > 1 file changed, 263 insertions(+), 67 deletions(-) > > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c > index 0ffb1cf17e0b2e..f0b915567cbcdc 100644 > --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c > @@ -48,6 +48,21 @@ enum arm_smmu_msi_index { > ARM_SMMU_MAX_MSIS, > }; > > +struct arm_smmu_entry_writer_ops; > +struct arm_smmu_entry_writer { > + const struct arm_smmu_entry_writer_ops *ops; > + struct arm_smmu_master *master; > +}; > + > +struct arm_smmu_entry_writer_ops { > + unsigned int num_entry_qwords; > + __le64 v_bit; > + void (*get_used)(const __le64 *entry, __le64 *used); > + void (*sync)(struct arm_smmu_entry_writer *writer); > +}; Can we avoid the indirection for now, please? I'm sure we'll want it later when you extend this to CDs, but for the initial support it just makes it more difficult to follow the flow. Should be a trivial thing to drop, I hope. > +static void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) > { > + unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0])); > + > + used_bits[0] = cpu_to_le64(STRTAB_STE_0_V); > + if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V))) > + return; > + > + /* > + * See 13.5 Summary of attribute/permission configuration fields for the > + * SHCFG behavior. It is only used for BYPASS, including S1DSS BYPASS, > + * and S2 only. > + */ > + if (cfg == STRTAB_STE_0_CFG_BYPASS || > + cfg == STRTAB_STE_0_CFG_S2_TRANS || > + (cfg == STRTAB_STE_0_CFG_S1_TRANS && > + FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) == > + STRTAB_STE_1_S1DSS_BYPASS)) > + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); Huh, SHCFG is really getting in the way here, isn't it? I think it also means we don't have a "hitless" transition from stage-2 translation -> bypass. I'm inclined to leave it set to "use incoming" all the time; the only difference I can see is if you have stage-2 translation and a master emitting outer-shareable transactions, in which case they'd now be outer-shareable instead of inner-shareable, which I think is harmless. Additionally, it looks like there's an existing buglet here in that we shouldn't set SHCFG if SMMU_IDR1.ATTR_TYPES_OVR == 0. 
> + > + used_bits[0] |= cpu_to_le64(STRTAB_STE_0_CFG); > + switch (cfg) { > + case STRTAB_STE_0_CFG_ABORT: > + case STRTAB_STE_0_CFG_BYPASS: > + break; > + case STRTAB_STE_0_CFG_S1_TRANS: > + used_bits[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT | > + STRTAB_STE_0_S1CTXPTR_MASK | > + STRTAB_STE_0_S1CDMAX); > + used_bits[1] |= > + cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | > + STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | > + STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW); > + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_EATS); > + used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID); > + break; > + case STRTAB_STE_0_CFG_S2_TRANS: > + used_bits[1] |= > + cpu_to_le64(STRTAB_STE_1_EATS); > + used_bits[2] |= > + cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR | > + STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI | > + STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R); > + used_bits[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK); > + break; With SHCFG fixed, can we go a step further with this and simply identify the live qwords directly, rather than on a field-by-field basis? I think we should be able to do the same "hitless" transitions you want with the coarser granularity. Will
On Thu, Feb 15, 2024 at 01:49:53PM +0000, Will Deacon wrote: > Hi Jason, > > On Tue, Feb 06, 2024 at 11:12:38AM -0400, Jason Gunthorpe wrote: > > As the comment in arm_smmu_write_strtab_ent() explains, this routine has > > been limited to only work correctly in certain scenarios that the caller > > must ensure. Generally the caller must put the STE into ABORT or BYPASS > > before attempting to program it to something else. > > This is looking pretty good now, but I have a few comments inline. Ok > > @@ -48,6 +48,21 @@ enum arm_smmu_msi_index { > > ARM_SMMU_MAX_MSIS, > > }; > > > > +struct arm_smmu_entry_writer_ops; > > +struct arm_smmu_entry_writer { > > + const struct arm_smmu_entry_writer_ops *ops; > > + struct arm_smmu_master *master; > > +}; > > + > > +struct arm_smmu_entry_writer_ops { > > + unsigned int num_entry_qwords; > > + __le64 v_bit; > > + void (*get_used)(const __le64 *entry, __le64 *used); > > + void (*sync)(struct arm_smmu_entry_writer *writer); > > +}; > > Can we avoid the indirection for now, please? I'm sure we'll want it later > when you extend this to CDs, but for the initial support it just makes it > more difficult to follow the flow. Should be a trivial thing to drop, I > hope. We can. > > +static void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) > > { > > + unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0])); > > + > > + used_bits[0] = cpu_to_le64(STRTAB_STE_0_V); > > + if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V))) > > + return; > > + > > + /* > > + * See 13.5 Summary of attribute/permission configuration fields for the > > + * SHCFG behavior. It is only used for BYPASS, including S1DSS BYPASS, > > + * and S2 only. > > + */ > > + if (cfg == STRTAB_STE_0_CFG_BYPASS || > > + cfg == STRTAB_STE_0_CFG_S2_TRANS || > > + (cfg == STRTAB_STE_0_CFG_S1_TRANS && > > + FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) == > > + STRTAB_STE_1_S1DSS_BYPASS)) > > + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); > > Huh, SHCFG is really getting in the way here, isn't it? I wouldn't say that.. It is just a complicated bit of the spec. One of the things we recently did was to audit all the cache settings and, at least, we then realized that SHCFG was being subtly used by S2 as well.. Not sure if that was intentional or if it was just missed from the spec that the S2 uses the value too. From that perspective I view this layout of used to be valuable. It forces the kind of reflection and rigor that I think is helpful. The fact we found a thing to improve on by inspection is proof of this worth to me. > I think it also means we don't have a "hitless" transition from > stage-2 translation -> bypass. Hmm, I didn't notice that. The kunit passed: [ 0.511483] 1..1 [ 0.511510] KTAP version 1 [ 0.511551] # Subtest: arm-smmu-v3-kunit-test [ 0.511592] # module: arm_smmu_v3_test [ 0.511594] 1..10 [ 0.511910] ok 1 arm_smmu_v3_write_ste_test_bypass_to_abort [ 0.512110] ok 2 arm_smmu_v3_write_ste_test_abort_to_bypass [ 0.512386] ok 3 arm_smmu_v3_write_ste_test_cdtable_to_abort [ 0.512631] ok 4 arm_smmu_v3_write_ste_test_abort_to_cdtable [ 0.512874] ok 5 arm_smmu_v3_write_ste_test_cdtable_to_bypass [ 0.513075] ok 6 arm_smmu_v3_write_ste_test_bypass_to_cdtable [ 0.513275] ok 7 arm_smmu_v3_write_ste_test_cdtable_s1dss_change [ 0.513466] ok 8 arm_smmu_v3_write_ste_test_s1dssbypass_to_stebypass [ 0.513672] ok 9 arm_smmu_v3_write_ste_test_stebypass_to_s1dssbypass [ 0.514148] ok 10 arm_smmu_v3_write_ste_test_non_hitless Which I see is because it did not test the S2 case... 
> I'm inclined to leave it set to "use incoming" all the time; the > only difference I can see is if you have stage-2 translation and a > master emitting outer-shareable transactions, in which case they'd now > be outer-shareable instead of inner-shareable, which I think is harmless. Broadly it seems to me to make sense that the iommu would try to have a consistent translation - that bypass and S2 use different cachability doesn't seem great. But isn't the current S2 value of 0 "non-sharable"? > Additionally, it looks like there's an existing buglet here in that we > shouldn't set SHCFG if SMMU_IDR1.ATTR_TYPES_OVR == 0. Ah because the spec says RES0.. I'll add these two into the pile of random stuff in part 3 > > + used_bits[0] |= cpu_to_le64(STRTAB_STE_0_CFG); > > + switch (cfg) { > > + case STRTAB_STE_0_CFG_ABORT: > > + case STRTAB_STE_0_CFG_BYPASS: > > + break; > > + case STRTAB_STE_0_CFG_S1_TRANS: > > + used_bits[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT | > > + STRTAB_STE_0_S1CTXPTR_MASK | > > + STRTAB_STE_0_S1CDMAX); > > + used_bits[1] |= > > + cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | > > + STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | > > + STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW); > > + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_EATS); > > + used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID); > > + break; > > + case STRTAB_STE_0_CFG_S2_TRANS: > > + used_bits[1] |= > > + cpu_to_le64(STRTAB_STE_1_EATS); > > + used_bits[2] |= > > + cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR | > > + STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI | > > + STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R); > > + used_bits[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK); > > + break; > > With SHCFG fixed, can we go a step further with this and simply identify > the live qwords directly, rather than on a field-by-field basis? I think > we should be able to do the same "hitless" transitions you want with the > coarser granularity. Not naively, Michael's excellent unit test shows it.. My understanding of your idea was roughly thus: void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) { unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0])); used_bits[0] = U64_MAX; if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V))) return; /* * See 13.5 Summary of attribute/permission configuration fields for the * SHCFG behavior. It is only used for BYPASS, including S1DSS BYPASS, * and S2 only. 
*/ if (cfg == STRTAB_STE_0_CFG_BYPASS || cfg == STRTAB_STE_0_CFG_S2_TRANS || (cfg == STRTAB_STE_0_CFG_S1_TRANS && FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) == STRTAB_STE_1_S1DSS_BYPASS)) used_bits[1] |= U64_MAX; used_bits[0] |= U64_MAX; switch (cfg) { case STRTAB_STE_0_CFG_ABORT: case STRTAB_STE_0_CFG_BYPASS: break; case STRTAB_STE_0_CFG_S1_TRANS: used_bits[0] |= U64_MAX; used_bits[1] |= U64_MAX; used_bits[2] |= U64_MAX; break; case STRTAB_STE_0_CFG_NESTED: used_bits[0] |= U64_MAX; used_bits[1] |= U64_MAX; fallthrough; case STRTAB_STE_0_CFG_S2_TRANS: used_bits[1] |= U64_MAX; used_bits[2] |= U64_MAX; used_bits[3] |= U64_MAX; break; default: memset(used_bits, 0xFF, sizeof(struct arm_smmu_ste)); WARN_ON(true); } } And the failures: [ 0.500676] ok 1 arm_smmu_v3_write_ste_test_bypass_to_abort [ 0.500818] ok 2 arm_smmu_v3_write_ste_test_abort_to_bypass [ 0.501014] ok 3 arm_smmu_v3_write_ste_test_cdtable_to_abort [ 0.501197] ok 4 arm_smmu_v3_write_ste_test_abort_to_cdtable [ 0.501340] # arm_smmu_v3_write_ste_test_cdtable_to_bypass: EXPECTATION FAILED at drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c:128 [ 0.501340] Expected test_writer.invalid_entry_written == !hitless, but [ 0.501340] test_writer.invalid_entry_written == 1 (0x1) [ 0.501340] !hitless == 0 (0x0) [ 0.501489] not ok 5 arm_smmu_v3_write_ste_test_cdtable_to_bypass [ 0.501787] # arm_smmu_v3_write_ste_test_bypass_to_cdtable: EXPECTATION FAILED at drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c:128 [ 0.501787] Expected test_writer.invalid_entry_written == !hitless, but [ 0.501787] test_writer.invalid_entry_written == 1 (0x1) [ 0.501787] !hitless == 0 (0x0) [ 0.501931] not ok 6 arm_smmu_v3_write_ste_test_bypass_to_cdtable [ 0.502274] ok 7 arm_smmu_v3_write_ste_test_cdtable_s1dss_change [ 0.502397] # arm_smmu_v3_write_ste_test_s1dssbypass_to_stebypass: EXPECTATION FAILED at drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c:128 [ 0.502397] Expected test_writer.invalid_entry_written == !hitless, but [ 0.502397] test_writer.invalid_entry_written == 1 (0x1) [ 0.502397] !hitless == 0 (0x0) [ 0.502473] # arm_smmu_v3_write_ste_test_s1dssbypass_to_stebypass: EXPECTATION FAILED at drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c:129 [ 0.502473] Expected test_writer.num_syncs == num_syncs_expected, but [ 0.502473] test_writer.num_syncs == 3 (0x3) [ 0.502473] num_syncs_expected == 2 (0x2) [ 0.502784] not ok 8 arm_smmu_v3_write_ste_test_s1dssbypass_to_stebypass [ 0.503073] # arm_smmu_v3_write_ste_test_stebypass_to_s1dssbypass: EXPECTATION FAILED at drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c:128 [ 0.503073] Expected test_writer.invalid_entry_written == !hitless, but [ 0.503073] test_writer.invalid_entry_written == 1 (0x1) [ 0.503073] !hitless == 0 (0x0) [ 0.503176] # arm_smmu_v3_write_ste_test_stebypass_to_s1dssbypass: EXPECTATION FAILED at drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c:129 [ 0.503176] Expected test_writer.num_syncs == num_syncs_expected, but [ 0.503176] test_writer.num_syncs == 3 (0x3) [ 0.503176] num_syncs_expected == 2 (0x2) [ 0.503464] not ok 9 arm_smmu_v3_write_ste_test_stebypass_to_s1dssbypass [ 0.503807] ok 10 arm_smmu_v3_write_ste_test_non_hitless BYPASS -> S1 requires changing overlapping bits in qword 1. The programming sequence would look like this: start qw[1] = SHCFG_INCOMING qw[1] = SHCFG_INCOMING | S1DSS qw[0] = S1 mode qw[1] = S1DSS The two states are sharing qw[1] and BYPASS ignores all of it except SHCFG_INCOMING. 
Since bypass would have its qw[1] marked as used due to the SHCFG there is no way to express that it is not looking at the other bits. We'd have to really start doing really hacky things like remove the SHCFG as a used field entirely - but I think if you do that you break the entire logic of the design and also go backwards to having programming that only works if STEs are constructed in certain ways. Thanks, Jason
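For reference, the qword 1 arithmetic behind that sequence, written against the arm_smmu_entry_qword_diff() logic in the patch (an illustration only, taking an S1 target whose S1DSS is not BYPASS so that SHCFG ends up unused in the target):

/*
 * BYPASS -> S1, qword 1 only (field names as in the patch):
 *
 *   cur_used[1]    = SHCFG                         (BYPASS reads nothing else here)
 *   target_used[1] = S1DSS | S1CIR | S1COR | S1CSH |
 *                    S1STALLD | STRW | EATS         (S1 ignores SHCFG)
 *
 *   unused_update[1] = (cur[1] & cur_used[1]) | (target[1] & ~cur_used[1])
 *                    = SHCFG_INCOMING | <target's S1 fields>
 *
 * (unused_update[1] & target_used[1]) == target[1], so qword 1 never sets a
 * bit in used_qword_diff; only qword 0 does, and the update takes the
 * hitless single-critical-qword path.
 */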
On 15/02/2024 4:01 pm, Jason Gunthorpe wrote: > On Thu, Feb 15, 2024 at 01:49:53PM +0000, Will Deacon wrote: >> Hi Jason, >> >> On Tue, Feb 06, 2024 at 11:12:38AM -0400, Jason Gunthorpe wrote: >>> As the comment in arm_smmu_write_strtab_ent() explains, this routine has >>> been limited to only work correctly in certain scenarios that the caller >>> must ensure. Generally the caller must put the STE into ABORT or BYPASS >>> before attempting to program it to something else. >> >> This is looking pretty good now, but I have a few comments inline. > > Ok > >>> @@ -48,6 +48,21 @@ enum arm_smmu_msi_index { >>> ARM_SMMU_MAX_MSIS, >>> }; >>> >>> +struct arm_smmu_entry_writer_ops; >>> +struct arm_smmu_entry_writer { >>> + const struct arm_smmu_entry_writer_ops *ops; >>> + struct arm_smmu_master *master; >>> +}; >>> + >>> +struct arm_smmu_entry_writer_ops { >>> + unsigned int num_entry_qwords; >>> + __le64 v_bit; >>> + void (*get_used)(const __le64 *entry, __le64 *used); >>> + void (*sync)(struct arm_smmu_entry_writer *writer); >>> +}; >> >> Can we avoid the indirection for now, please? I'm sure we'll want it later >> when you extend this to CDs, but for the initial support it just makes it >> more difficult to follow the flow. Should be a trivial thing to drop, I >> hope. > > We can. Ack, the abstraction is really hard to follow, and much of that seems entirely self-inflicted in the amount of recalculating information which was in-context in a previous step but then thrown away. And as best I can tell I think it will still end up doing more CFGIs than needed. Keeping a single monolithic check-and-update function will be *so* much easier to understand and maintain. As far as CDs go, anything we might reasonably want to change in a live CD is all in the first word so I don't see any value in attempting to generalise further on that side of things. Maybe arm_smmu_write_ctx_desc() could stand to be a bit prettier, but honestly I don't think it's too bad as-is. >>> +static void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) >>> { >>> + unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0])); >>> + >>> + used_bits[0] = cpu_to_le64(STRTAB_STE_0_V); >>> + if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V))) >>> + return; >>> + >>> + /* >>> + * See 13.5 Summary of attribute/permission configuration fields for the >>> + * SHCFG behavior. It is only used for BYPASS, including S1DSS BYPASS, >>> + * and S2 only. >>> + */ >>> + if (cfg == STRTAB_STE_0_CFG_BYPASS || >>> + cfg == STRTAB_STE_0_CFG_S2_TRANS || >>> + (cfg == STRTAB_STE_0_CFG_S1_TRANS && >>> + FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) == >>> + STRTAB_STE_1_S1DSS_BYPASS)) >>> + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); >> >> Huh, SHCFG is really getting in the way here, isn't it? > > I wouldn't say that.. It is just a complicated bit of the spec. One of > the things we recently did was to audit all the cache settings and, at > least, we then realized that SHCFG was being subtly used by S2 as > well.. Yeah, that really shouldn't be subtle; incoming attributes are replaced by S1 translation, thus they are relevant to not-S1 configs. I think it's likely to be significantly more straightforward to give up on the switch statement and jump straight into the more architectural paradigm at this level, e.g. // Stage 1 if (cfg & BIT(0)) { ... } else { ... } // Stage 2 if (cfg & BIT(1)) { ... } else { ... } Thanks, Robin.
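As a sketch of what that shape could look like (illustrative only, not Robin's code: it fills in the same used bits as the patch's switch statement and drops the WARN fallback for reserved CFG encodings):

static void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits)
{
        unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0]));

        used_bits[0] = cpu_to_le64(STRTAB_STE_0_V);
        if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V)))
                return;
        used_bits[0] |= cpu_to_le64(STRTAB_STE_0_CFG);

        /* STE.Config[2] clear: transactions terminate, nothing else is read */
        if (!(cfg & BIT(2)))
                return;

        /* Stage 1 enabled: the CD table fields are live */
        if (cfg & BIT(0)) {
                used_bits[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT |
                                            STRTAB_STE_0_S1CTXPTR_MASK |
                                            STRTAB_STE_0_S1CDMAX);
                used_bits[1] |= cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |
                                            STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |
                                            STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW |
                                            STRTAB_STE_1_EATS);
                used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID);
                /* Incoming attributes still matter when S1DSS selects bypass */
                if (FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) ==
                    STRTAB_STE_1_S1DSS_BYPASS)
                        used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG);
        } else {
                /* Bypass or S2-only: incoming attributes reach the output side */
                used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG);
        }

        /* Stage 2 enabled: VMID and the S2 translation fields are live */
        if (cfg & BIT(1)) {
                used_bits[1] |= cpu_to_le64(STRTAB_STE_1_EATS);
                used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR |
                                            STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI |
                                            STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R);
                used_bits[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK);
        }
}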
On 2024-02-15 6:42 pm, Robin Murphy wrote: [...] >>>> +static void arm_smmu_get_ste_used(const __le64 *ent, __le64 >>>> *used_bits) >>>> { >>>> + unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, >>>> le64_to_cpu(ent[0])); >>>> + >>>> + used_bits[0] = cpu_to_le64(STRTAB_STE_0_V); >>>> + if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V))) >>>> + return; >>>> + >>>> + /* >>>> + * See 13.5 Summary of attribute/permission configuration >>>> fields for the >>>> + * SHCFG behavior. It is only used for BYPASS, including S1DSS >>>> BYPASS, >>>> + * and S2 only. >>>> + */ >>>> + if (cfg == STRTAB_STE_0_CFG_BYPASS || >>>> + cfg == STRTAB_STE_0_CFG_S2_TRANS || >>>> + (cfg == STRTAB_STE_0_CFG_S1_TRANS && >>>> + FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) == >>>> + STRTAB_STE_1_S1DSS_BYPASS)) >>>> + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); >>> >>> Huh, SHCFG is really getting in the way here, isn't it? >> >> I wouldn't say that.. It is just a complicated bit of the spec. One of >> the things we recently did was to audit all the cache settings and, at >> least, we then realized that SHCFG was being subtly used by S2 as >> well.. > > Yeah, that really shouldn't be subtle; incoming attributes are replaced > by S1 translation, thus they are relevant to not-S1 configs. That said, in this specific case I don't understand why we're worrying about SHCFG here at all - we're never going to make use of any value other than "use incoming" because we can't rely on it being implemented in the first place, and even if it is, we really don't want to start getting into the forced-coherency notion that the DMA layer can't understand and devicetree can't describe. We're still unconditionally setting the "use incoming" value for MTCFG, ALLOCCFG, PRIVCFG and INSTCFG without checking them, so there's no logic in pretending SHCFG is any different from its peers simply because its encoding is slightly less convenient. If the micro-optimisation of not setting it when we know it's going to be ignored anyway starts getting in the way, just drop that. Thanks, Robin.
On Thu, Feb 15, 2024 at 06:42:37PM +0000, Robin Murphy wrote: > > > > @@ -48,6 +48,21 @@ enum arm_smmu_msi_index { > > > > ARM_SMMU_MAX_MSIS, > > > > }; > > > > +struct arm_smmu_entry_writer_ops; > > > > +struct arm_smmu_entry_writer { > > > > + const struct arm_smmu_entry_writer_ops *ops; > > > > + struct arm_smmu_master *master; > > > > +}; > > > > + > > > > +struct arm_smmu_entry_writer_ops { > > > > + unsigned int num_entry_qwords; > > > > + __le64 v_bit; > > > > + void (*get_used)(const __le64 *entry, __le64 *used); > > > > + void (*sync)(struct arm_smmu_entry_writer *writer); > > > > +}; > > > > > > Can we avoid the indirection for now, please? I'm sure we'll want it later > > > when you extend this to CDs, but for the initial support it just makes it > > > more difficult to follow the flow. Should be a trivial thing to drop, I > > > hope. > > > > We can. > > Ack, the abstraction is really hard to follow, and much of that > seems entirely self-inflicted in the amount of recalculating > information which was in-context in a previous step but then thrown > away. I'm not sure I understand this can you be more specific? I don't know what we are throwing away that you see? > And as best I can tell I think it will still end up doing more CFGIs > than needed. I think we've minimized the number of steps and Michael did check it, even pushed tests for the popular scenarios into the kunit. He found a case where it was not optimal and it was improved. Mostafa asked about extra syncs, and you can read my reply explaining why. We both agreed the sync's are necessary. The only extra thing I know of is the zeroing of fields. Perhaps we don't have to do this, but I think we should. Operating with the STE in a known state seems like the conservative choice. Regardless if you have a case in mind where there are extra steps lets try it in the kunit and check. This is not a performance path, so I wouldn't invest too much in this question. > Keeping a single monolithic check-and-update function will be *so* much > easier to understand and maintain. The ops are used by the kunit test suite and I think the kunit is valuable. Further I've been looking at the AMD driver and it has the same problem to solve for its DTE and can use this same solution. Intel also has > 128 bit structures too. I already drafted an exploration of using this algorithm in AMD. I see a someday future where we will move this to shared core code. In which case the driver only provides the used and sync operation which I think is a low driver burden for solving such a tricky shared problem. There is some more shared complexity here on x86 which needs to use 128 bit stores if the CPU supports those instructions. IOW this approach is nice and valuable outside ARM. I would like to move in a direction where we simply use this shared code for all multi-qword HW descriptors. We've certainly invested enough in building it and none of the three drivers have anything better. > As far as CDs go, anything we might reasonably want to change in a > live CD is all in the first word so I don't see any value in Changing from a S1 -> S1 requires updating two qwords in the CD and that requires the V=0 flow that the current arm_smmu_write_ctx_desc() doesn't do. It is not that arm_smmu_write_ctx_desc() needs to be prettier, it needs more functionality. 
> > > > +static void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) > > > > { > > > > + unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0])); > > > > + > > > > + used_bits[0] = cpu_to_le64(STRTAB_STE_0_V); > > > > + if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V))) > > > > + return; > > > > + > > > > + /* > > > > + * See 13.5 Summary of attribute/permission configuration fields for the > > > > + * SHCFG behavior. It is only used for BYPASS, including S1DSS BYPASS, > > > > + * and S2 only. > > > > + */ > > > > + if (cfg == STRTAB_STE_0_CFG_BYPASS || > > > > + cfg == STRTAB_STE_0_CFG_S2_TRANS || > > > > + (cfg == STRTAB_STE_0_CFG_S1_TRANS && > > > > + FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) == > > > > + STRTAB_STE_1_S1DSS_BYPASS)) > > > > + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); > > > > > > Huh, SHCFG is really getting in the way here, isn't it? > > > > I wouldn't say that.. It is just a complicated bit of the spec. One of > > the things we recently did was to audit all the cache settings and, at > > least, we then realized that SHCFG was being subtly used by S2 as > > well.. > > Yeah, that really shouldn't be subtle; incoming attributes are replaced by > S1 translation, thus they are relevant to not-S1 configs. That is a really nice way to summarize the spec! But my remark was more about the code, where it isn't so obvious what value it intended to have for SHCFG in the S2 case. This doesn't really change anything about this patch, we'd still have the above hunk to accurately reflect the SHCFG usage, and we'd still set SHCFG to 0 in S1 cases where it isn't used by HW, just like today. > I think it's likely to be significantly more straightforward to give up on the switch statement and jump straight into the more architectural paradigm at this level, e.g. I've thought about that, I can make an effort to do this, the later nesting change would probably look nicer in this style. Thanks, Jason
On Thu, Feb 15, 2024 at 08:11:38PM +0000, Robin Murphy wrote: > On 2024-02-15 6:42 pm, Robin Murphy wrote: > [...] > > > > > +static void arm_smmu_get_ste_used(const __le64 *ent, __le64 > > > > > *used_bits) > > > > > { > > > > > + unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, > > > > > le64_to_cpu(ent[0])); > > > > > + > > > > > + used_bits[0] = cpu_to_le64(STRTAB_STE_0_V); > > > > > + if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V))) > > > > > + return; > > > > > + > > > > > + /* > > > > > + * See 13.5 Summary of attribute/permission > > > > > configuration fields for the > > > > > + * SHCFG behavior. It is only used for BYPASS, > > > > > including S1DSS BYPASS, > > > > > + * and S2 only. > > > > > + */ > > > > > + if (cfg == STRTAB_STE_0_CFG_BYPASS || > > > > > + cfg == STRTAB_STE_0_CFG_S2_TRANS || > > > > > + (cfg == STRTAB_STE_0_CFG_S1_TRANS && > > > > > + FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) == > > > > > + STRTAB_STE_1_S1DSS_BYPASS)) > > > > > + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); > > > > > > > > Huh, SHCFG is really getting in the way here, isn't it? > > > > > > I wouldn't say that.. It is just a complicated bit of the spec. One of > > > the things we recently did was to audit all the cache settings and, at > > > least, we then realized that SHCFG was being subtly used by S2 as > > > well.. > > > > Yeah, that really shouldn't be subtle; incoming attributes are replaced > > by S1 translation, thus they are relevant to not-S1 configs. > > That said, in this specific case I don't understand why we're worrying about > SHCFG here at all - we're never going to make use of any value other than > "use incoming" because we can't rely on it being implemented in the first > place, and even if it is, we really don't want to start getting into the > forced-coherency notion that the DMA layer can'#t understand and devicetree > can't describe. Yup, that's exactly what I'm thinking. We currently set it to NSH when translation is enabled, so that the stage-2 shareability is effectively an override. However, the device is either coherent, or it isn't, and so we should just leave this always set to "use incoming" in my opinion, which means we no longer need to care about qword 1 for the bypass case. Will
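Concretely, STE construction would then reduce to something like this for every translating config (a sketch only; ARM_SMMU_FEAT_ATTR_TYPES_OVR is a hypothetical feature flag standing in for a SMMU_IDR1.ATTR_TYPES_OVR check that the driver does not have yet):

/*
 * Sketch: never override shareability. Ask for "use incoming" when the
 * attribute override is implemented, otherwise leave SHCFG as 0 because the
 * field is RES0 when SMMU_IDR1.ATTR_TYPES_OVR == 0.
 */
if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR)      /* hypothetical flag */
        target->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
                                                  STRTAB_STE_1_SHCFG_INCOMING));

With SHCFG the same in every valid STE, it can no longer differ between the current and target entries, so it drops out of the hitless-update discussion entirely.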
On Thu, Feb 15, 2024 at 12:01:35PM -0400, Jason Gunthorpe wrote: > On Thu, Feb 15, 2024 at 01:49:53PM +0000, Will Deacon wrote: > > On Tue, Feb 06, 2024 at 11:12:38AM -0400, Jason Gunthorpe wrote: > > > @@ -48,6 +48,21 @@ enum arm_smmu_msi_index { > > > ARM_SMMU_MAX_MSIS, > > > }; > > > > > > +struct arm_smmu_entry_writer_ops; > > > +struct arm_smmu_entry_writer { > > > + const struct arm_smmu_entry_writer_ops *ops; > > > + struct arm_smmu_master *master; > > > +}; > > > + > > > +struct arm_smmu_entry_writer_ops { > > > + unsigned int num_entry_qwords; > > > + __le64 v_bit; > > > + void (*get_used)(const __le64 *entry, __le64 *used); > > > + void (*sync)(struct arm_smmu_entry_writer *writer); > > > +}; > > > > Can we avoid the indirection for now, please? I'm sure we'll want it later > > when you extend this to CDs, but for the initial support it just makes it > > more difficult to follow the flow. Should be a trivial thing to drop, I > > hope. > > We can. Thanks. > > I think it also means we don't have a "hitless" transition from > > stage-2 translation -> bypass. > > Hmm, I didn't notice that. The kunit passed: > > [ 0.511483] 1..1 > [ 0.511510] KTAP version 1 > [ 0.511551] # Subtest: arm-smmu-v3-kunit-test > [ 0.511592] # module: arm_smmu_v3_test > [ 0.511594] 1..10 > [ 0.511910] ok 1 arm_smmu_v3_write_ste_test_bypass_to_abort > [ 0.512110] ok 2 arm_smmu_v3_write_ste_test_abort_to_bypass > [ 0.512386] ok 3 arm_smmu_v3_write_ste_test_cdtable_to_abort > [ 0.512631] ok 4 arm_smmu_v3_write_ste_test_abort_to_cdtable > [ 0.512874] ok 5 arm_smmu_v3_write_ste_test_cdtable_to_bypass > [ 0.513075] ok 6 arm_smmu_v3_write_ste_test_bypass_to_cdtable > [ 0.513275] ok 7 arm_smmu_v3_write_ste_test_cdtable_s1dss_change > [ 0.513466] ok 8 arm_smmu_v3_write_ste_test_s1dssbypass_to_stebypass > [ 0.513672] ok 9 arm_smmu_v3_write_ste_test_stebypass_to_s1dssbypass > [ 0.514148] ok 10 arm_smmu_v3_write_ste_test_non_hitless > > Which I see is because it did not test the S2 case... Oops! > > Additionally, it looks like there's an existing buglet here in that we > > shouldn't set SHCFG if SMMU_IDR1.ATTR_TYPES_OVR == 0. > > Ah because the spec says RES0.. I'll add these two into the pile of > random stuff in part 3 I don't think this needs to wait until part 3, but it also doesn't need to be part of your series. I'll make a note that we can improve this. > > > + used_bits[0] |= cpu_to_le64(STRTAB_STE_0_CFG); > > > + switch (cfg) { > > > + case STRTAB_STE_0_CFG_ABORT: > > > + case STRTAB_STE_0_CFG_BYPASS: > > > + break; > > > + case STRTAB_STE_0_CFG_S1_TRANS: > > > + used_bits[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT | > > > + STRTAB_STE_0_S1CTXPTR_MASK | > > > + STRTAB_STE_0_S1CDMAX); > > > + used_bits[1] |= > > > + cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | > > > + STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | > > > + STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW); > > > + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_EATS); > > > + used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID); > > > + break; > > > + case STRTAB_STE_0_CFG_S2_TRANS: > > > + used_bits[1] |= > > > + cpu_to_le64(STRTAB_STE_1_EATS); > > > + used_bits[2] |= > > > + cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR | > > > + STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI | > > > + STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R); > > > + used_bits[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK); > > > + break; > > > > With SHCFG fixed, can we go a step further with this and simply identify > > the live qwords directly, rather than on a field-by-field basis? 
I think > > we should be able to do the same "hitless" transitions you want with the > > coarser granularity. > > Not naively, Michael's excellent unit test shows it.. My understanding > of your idea was roughly thus: > > void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) > { > unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0])); > > used_bits[0] = U64_MAX; > if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V))) > return; > > /* > * See 13.5 Summary of attribute/permission configuration fields for the > * SHCFG behavior. It is only used for BYPASS, including S1DSS BYPASS, > * and S2 only. > */ > if (cfg == STRTAB_STE_0_CFG_BYPASS || > cfg == STRTAB_STE_0_CFG_S2_TRANS || > (cfg == STRTAB_STE_0_CFG_S1_TRANS && > FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) == > STRTAB_STE_1_S1DSS_BYPASS)) > used_bits[1] |= U64_MAX; > > used_bits[0] |= U64_MAX; > switch (cfg) { > case STRTAB_STE_0_CFG_ABORT: > case STRTAB_STE_0_CFG_BYPASS: > break; > case STRTAB_STE_0_CFG_S1_TRANS: > used_bits[0] |= U64_MAX; > used_bits[1] |= U64_MAX; > used_bits[2] |= U64_MAX; > break; > case STRTAB_STE_0_CFG_NESTED: > used_bits[0] |= U64_MAX; > used_bits[1] |= U64_MAX; > fallthrough; > case STRTAB_STE_0_CFG_S2_TRANS: > used_bits[1] |= U64_MAX; > used_bits[2] |= U64_MAX; > used_bits[3] |= U64_MAX; > break; Very roughly, yes, although I'd go further and just return a bitmap of used qwords instead of tracking these bits. Basically, we could have some #defines saying which qwords are used by which configs, and then we can simplify the algorithm while retaining the ability to reject updates to qwords which we're not expecting. > And the failures: [...] > BYPASS -> S1 requires changing overlapping bits in qword 1. The > programming sequence would look like this: > > start qw[1] = SHCFG_INCOMING > qw[1] = SHCFG_INCOMING | S1DSS > qw[0] = S1 mode > qw[1] = S1DSS > > The two states are sharing qw[1] and BYPASS ignores all of it except > SHCFG_INCOMING. Since bypass would have its qw[1] marked as used due > to the SHCFG there is no way to express that it is not looking at the > other bits. > > We'd have to really start doing really hacky things like remove the > SHCFG as a used field entirely - but I think if you do that you break > the entire logic of the design and also go backwards to having > programming that only works if STEs are constructed in certain ways. I would actually like to remove SHCFG as a used field. If the encoding was less whacky (i.e. if 0b00 always meant "use incoming"), then it would be easy, but it shouldn't be too hard to work around that. Then BYPASS doesn't need to worry about qword 1 at all. Will
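For illustration, the per-config masks being described could be as small as the following (names invented here, and assuming SHCFG is dropped from the bypass case as suggested above):

/* Illustrative only: which STE qwords each config is expected to use */
#define STE_LIVE_QWORDS_ABORT   BIT(0)
#define STE_LIVE_QWORDS_BYPASS  BIT(0)  /* qword 1 ignored once SHCFG is always "incoming" */
#define STE_LIVE_QWORDS_S1      (BIT(0) | BIT(1) | BIT(2))
#define STE_LIVE_QWORDS_S2      (BIT(0) | BIT(1) | BIT(2) | BIT(3))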
On Wed, Feb 21, 2024 at 01:49:23PM +0000, Will Deacon wrote: > Very roughly, yes, although I'd go further and just return a bitmap of > used qwords instead of tracking these bits. Basically, we could have some > #defines saying which qwords are used by which configs, I don't think this will work well for CD's EPD0 case.. static void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits) { used_bits[0] = cpu_to_le64(CTXDESC_CD_0_V); if (!(ent[0] & cpu_to_le64(CTXDESC_CD_0_V))) return; memset(used_bits, 0xFF, sizeof(struct arm_smmu_cd)); /* EPD0 means T0SZ/TG0/IR0/OR0/SH0/TTB0 are IGNORED */ if (ent[0] & cpu_to_le64(CTXDESC_CD_0_TCR_EPD0)) { used_bits[0] &= ~cpu_to_le64( CTXDESC_CD_0_TCR_T0SZ | CTXDESC_CD_0_TCR_TG0 | CTXDESC_CD_0_TCR_IRGN0 | CTXDESC_CD_0_TCR_ORGN0 | CTXDESC_CD_0_TCR_SH0); used_bits[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK); } } > and then we can > simplify the algorithm while retaining the ability to reject updates > to qwords which we're not expecting. It is not much simplification. arm_smmu_entry_qword_diff() gets a bit shorter (not that it is complex anyhow) and other stuff gets worse. > > We'd have to really start doing really hacky things like remove the > > SHCFG as a used field entirely - but I think if you do that you break > > the entire logic of the design and also go backwards to having > > programming that only works if STEs are constructed in certain ways. > > I would actually like to remove SHCFG as a used field. If the encoding > was less whacky (i.e. if 0b00 always meant "use incoming"), then it would > be easy, but it shouldn't be too hard to work around that. But why? You throw away the entire logic of the design, go back to subtly coupling the two parts, and *for what*? Exactly what are we trying to achieve in return? You haven't explained why we are still discussing this afer 7 months. It really isn't worthwhile. Jason
On Wed, Feb 21, 2024 at 10:08 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Wed, Feb 21, 2024 at 01:49:23PM +0000, Will Deacon wrote: > > > Very roughly, yes, although I'd go further and just return a bitmap of > > used qwords instead of tracking these bits. Basically, we could have some > > #defines saying which qwords are used by which configs, > > I don't think this will work well for CD's EPD0 case.. > > static void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits) > { > used_bits[0] = cpu_to_le64(CTXDESC_CD_0_V); > if (!(ent[0] & cpu_to_le64(CTXDESC_CD_0_V))) > return; > memset(used_bits, 0xFF, sizeof(struct arm_smmu_cd)); > > /* EPD0 means T0SZ/TG0/IR0/OR0/SH0/TTB0 are IGNORED */ > if (ent[0] & cpu_to_le64(CTXDESC_CD_0_TCR_EPD0)) { > used_bits[0] &= ~cpu_to_le64( > CTXDESC_CD_0_TCR_T0SZ | CTXDESC_CD_0_TCR_TG0 | > CTXDESC_CD_0_TCR_IRGN0 | CTXDESC_CD_0_TCR_ORGN0 | > CTXDESC_CD_0_TCR_SH0); > used_bits[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK); > } > } > > > and then we can > > simplify the algorithm while retaining the ability to reject updates > > to qwords which we're not expecting. > > It is not much simplification. arm_smmu_entry_qword_diff() gets a bit > shorter (not that it is complex anyhow) and other stuff gets worse. I think the simplification here is in the first if branch of arm_smmu_write_ste. With Will's proposal, we only perform a hitless update if there's a single used qword that needs updating. There's no longer a case where we first set unused bits in qwords whose other bits are in use. I'd argue that setting unused bits of a q word was very clever and removing the logic does conceptually simplify things, although yes it's not much fewer lines of code. I also don't think this throws away the entire logic of the current design, the idea of counting the number of qwords that differ and writing qwords that are unused first is still there. But, it does mean that hitless updates are only possible under narrower circumstances...We now have to figure out if there are transitions where this is problematic where we could previously assume that we'd always get the best behavior possible. Both in the present (i.e this SHCFG discussion and EPD0 case) and in the future if new parts of the configs start getting used. IMO not having to think about this is a meaningful advantage of the current solution. > > > We'd have to really start doing really hacky things like remove the > > > SHCFG as a used field entirely - but I think if you do that you break > > > the entire logic of the design and also go backwards to having > > > programming that only works if STEs are constructed in certain ways. > > > > I would actually like to remove SHCFG as a used field. If the encoding > > was less whacky (i.e. if 0b00 always meant "use incoming"), then it would > > be easy, but it shouldn't be too hard to work around that. > What do you mean by removing SHCFG as a used field? Are we changing the driver so that it only ever sets SHCFG to a single possible value? Or are we talking about fudging things and pretending it's not used when it is and might have different values?
On Thu, Feb 22, 2024 at 12:19 AM Michael Shavit <mshavit@google.com> wrote: > > On Wed, Feb 21, 2024 at 10:08 PM Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > On Wed, Feb 21, 2024 at 01:49:23PM +0000, Will Deacon wrote: > > > > > Very roughly, yes, although I'd go further and just return a bitmap of > > > used qwords instead of tracking these bits. Basically, we could have some > > > #defines saying which qwords are used by which configs, > > > > I don't think this will work well for CD's EPD0 case.. > > > > static void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits) > > { > > used_bits[0] = cpu_to_le64(CTXDESC_CD_0_V); > > if (!(ent[0] & cpu_to_le64(CTXDESC_CD_0_V))) > > return; > > memset(used_bits, 0xFF, sizeof(struct arm_smmu_cd)); > > > > /* EPD0 means T0SZ/TG0/IR0/OR0/SH0/TTB0 are IGNORED */ > > if (ent[0] & cpu_to_le64(CTXDESC_CD_0_TCR_EPD0)) { > > used_bits[0] &= ~cpu_to_le64( > > CTXDESC_CD_0_TCR_T0SZ | CTXDESC_CD_0_TCR_TG0 | > > CTXDESC_CD_0_TCR_IRGN0 | CTXDESC_CD_0_TCR_ORGN0 | > > CTXDESC_CD_0_TCR_SH0); > > used_bits[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK); > > } > > } > > > > > and then we can > > > simplify the algorithm while retaining the ability to reject updates > > > to qwords which we're not expecting. > > > > It is not much simplification. arm_smmu_entry_qword_diff() gets a bit > > shorter (not that it is complex anyhow) and other stuff gets worse. > > I think the simplification here is in the first if branch of > arm_smmu_write_ste. With Will's proposal, we only perform a hitless > update if there's a single used qword that needs updating. There's no > longer a case where we first set unused bits in qwords whose other > bits are in use. I'd argue that setting unused bits of a q word was > very clever and removing the logic does conceptually simplify things, > although yes it's not much fewer lines of code. I also don't think > this throws away the entire logic of the current design, the idea of > counting the number of qwords that differ and writing qwords that are > unused first is still there. > > But, it does mean that hitless updates are only possible under > narrower circumstances...We now have to figure out if there are > transitions where this is problematic where we could previously assume > that we'd always get the best behavior possible. Both in the present > (i.e this SHCFG discussion and EPD0 case) and in the future if new > parts of the configs start getting used. IMO not having to think about > this is a meaningful advantage of the current solution. To be more explicit, I hope we can keep the current solution. The tests we added mitigates the extra complexity, while there's no certainty that the 1-bit-per-qword proposal will always be satisfactory in the future (nor have we even reached consensus that it is satisfactory in the present with the part 2 CD series)
On Thu, Feb 22, 2024 at 12:19:06AM +0800, Michael Shavit wrote: > I think the simplification here is in the first if branch of > arm_smmu_write_ste. With Will's proposal, we only perform a hitless > update if there's a single used qword that needs updating. The normal cases like BYPASS -> S1 still require updating QW[1,2] before updating QW[0], and the reverse as well. That still needs the three entry_set()'s to process the same way. From what I can see if we did 1 bit per qw: - get_used becomes harder to explain but shorter (we ignore the used qw 1 for bypass/abort) - arm_smmu_entry_qword_diff becomes a bit simpler, less bitwise logic, no unused_update - arm_smmu_write_entry() has the same logic but unused_update is replaced by target - We have to hack something to make SHCFG=1 - change the make functions or have arm_smmu_write_ste() force SHCFG=1 - We have to write separate programming logic for CD - always do V=0/1 for normal updates, and a special EPD0 flow. All doable, but I don't see the benefit in aggregate.. Jason
On Wed, Feb 21, 2024 at 10:08:18AM -0400, Jason Gunthorpe wrote: > On Wed, Feb 21, 2024 at 01:49:23PM +0000, Will Deacon wrote: > > > Very roughly, yes, although I'd go further and just return a bitmap of > > used qwords instead of tracking these bits. Basically, we could have some > > #defines saying which qwords are used by which configs, > > I don't think this will work well for CD's EPD0 case.. > > static void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits) > { > used_bits[0] = cpu_to_le64(CTXDESC_CD_0_V); > if (!(ent[0] & cpu_to_le64(CTXDESC_CD_0_V))) > return; > memset(used_bits, 0xFF, sizeof(struct arm_smmu_cd)); > > /* EPD0 means T0SZ/TG0/IR0/OR0/SH0/TTB0 are IGNORED */ > if (ent[0] & cpu_to_le64(CTXDESC_CD_0_TCR_EPD0)) { > used_bits[0] &= ~cpu_to_le64( > CTXDESC_CD_0_TCR_T0SZ | CTXDESC_CD_0_TCR_TG0 | > CTXDESC_CD_0_TCR_IRGN0 | CTXDESC_CD_0_TCR_ORGN0 | > CTXDESC_CD_0_TCR_SH0); > used_bits[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK); > } > } Please can you explain more about the issue here? I know what EPDx are, but I'm not understanding why they're problematic. This presumably involves a hitless transition to/from an aborting CD? > > and then we can > > simplify the algorithm while retaining the ability to reject updates > > to qwords which we're not expecting. > > It is not much simplification. arm_smmu_entry_qword_diff() gets a bit > shorter (not that it is complex anyhow) and other stuff gets worse. > > > > We'd have to really start doing really hacky things like remove the > > > SHCFG as a used field entirely - but I think if you do that you break > > > the entire logic of the design and also go backwards to having > > > programming that only works if STEs are constructed in certain ways. > > > > I would actually like to remove SHCFG as a used field. If the encoding > > was less whacky (i.e. if 0b00 always meant "use incoming"), then it would > > be easy, but it shouldn't be too hard to work around that. > > But why? > > You throw away the entire logic of the design, go back to subtly > coupling the two parts, and *for what*? Exactly what are we trying to > achieve in return? You haven't explained why we are still discussing > this afer 7 months. It really isn't worthwhile. I'm just trying to avoid introducing dynamic behaviours to the driver which aren't actually used, and per-qword tracking feels like an easier way to maintain the hitless updates for the cases you care about. It's really not about throwing away the entire logic of the design -- as I said, I think this is looking pretty good. I'm also absolutely open to being convinced that per-field makes more sense and per-qword is terrible, so I'd really like to understand the E0PD case more. As an aside: is this per-field/per-qword discussion the only thing holding up a v6? With the rest of the feedback addressed and a version of Michael's selftest that exercises stage-2 translating domains, I'd like to think we could get it queued up soon. Cheers, Will
On Thu, Feb 22, 2024 at 05:43:46PM +0000, Will Deacon wrote: > On Wed, Feb 21, 2024 at 10:08:18AM -0400, Jason Gunthorpe wrote: > > On Wed, Feb 21, 2024 at 01:49:23PM +0000, Will Deacon wrote: > > > > > Very roughly, yes, although I'd go further and just return a bitmap of > > > used qwords instead of tracking these bits. Basically, we could have some > > > #defines saying which qwords are used by which configs, > > > > I don't think this will work well for CD's EPD0 case.. > > > > static void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits) > > { > > used_bits[0] = cpu_to_le64(CTXDESC_CD_0_V); > > if (!(ent[0] & cpu_to_le64(CTXDESC_CD_0_V))) > > return; > > memset(used_bits, 0xFF, sizeof(struct arm_smmu_cd)); > > > > /* EPD0 means T0SZ/TG0/IR0/OR0/SH0/TTB0 are IGNORED */ > > if (ent[0] & cpu_to_le64(CTXDESC_CD_0_TCR_EPD0)) { > > used_bits[0] &= ~cpu_to_le64( > > CTXDESC_CD_0_TCR_T0SZ | CTXDESC_CD_0_TCR_TG0 | > > CTXDESC_CD_0_TCR_IRGN0 | CTXDESC_CD_0_TCR_ORGN0 | > > CTXDESC_CD_0_TCR_SH0); > > used_bits[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK); > > } > > } > > Please can you explain more about the issue here? I know what EPDx are, > but I'm not understanding why they're problematic. This presumably > involves a hitless transition to/from an aborting CD? When a process using SVA exits uncleanly the MM is released so the SMMU HW must stop chasing the page table pointers since all that memory will be freed. However, in an unclean exit we can't control the order of shutdown so something like uacce or RDMA may not have quieted the DMA device yet. So there is a period during shutdown where the mm has been released and the device is doing DMA, the desire is that the DMA continue to be handled as a PRI and the SW will return failure for all PRI requests. Specifically we do not want to trigger any dmesg log events during this condition. Jean-Philippe came up with this solution where we hitlessly use EPD0 in release to allow the mm to release the page table while continuing to use the PRI flow. So it is going from a "SVA domain with a page table" to a "SVA domain without a page table but EPD0 set", hitlessly. > I'm just trying to avoid introducing dynamic behaviours to the driver > which aren't actually used, and per-qword tracking feels like an easier > way to maintain the hitless updates for the cases you care about. It's > really not about throwing away the entire logic of the design -- as I > said, I think this is looking pretty good. I'm also absolutely open to > being convinced that per-field makes more sense and per-qword is terrible, > so I'd really like to understand the E0PD case more. It is not more sense/terrible, it is more that we have to make some trade offs. I outlined what I think would be needed to make per-qw work in the other email: - get_used becomes harder to explain but shorter (we ignore the used qw 1 for bypass/abort) - arm_smmu_entry_qword_diff becomes a bit simpler, less bitwise logic, no unused_update - arm_smmu_write_entry() has the same logic but unused_update is replaced by target - We have to hack something to make SHCFG=1 - change the make functions or have arm_smmu_write_ste() force SHCFG=1. - We have to write a seperate programming logic for CD - always do V=0/1 for normal updates, and a special EPD0 flow. I think it is worse over all because none of those trade offs really make the code clearer, and I dislike the idea of open coding CD. Especially now that we have a test suite that requires the ops anyhow. 
It is a minor decision; trust Michael and me to make this choice, we both agree now and have spent a lot of time studying this. > As an aside: is this per-field/per-qword discussion the only thing holding > up a v6? As far as I know, yes. I have not typed in all the feedback yet, but I hope to get that done today. I will try to post it by Monday so we can see what it looks like with Robin's suggestion but without per-qw. > With the rest of the feedback addressed and a version of Michael's > selftest that exercises stage-2 translating domains, I'd like to > think we could get it queued up soon. I would really like this, we have so many more patches to work on, you probably saw the HTTU stuff was reposted again, we have a clean full BTM enablement now on the list for the first time, nesting patches, and more. Including this, I'm tracking a work list of about 100-150 patches for SMMUv3 in the next little bit. This is not unique to SMMUv3, AMD is on part 6 of work for their driver, and Intel has been pushing ~10-20 patches/cycle pretty reliably. iommufd has opened the door to actually solving a lot of the stuck problems and everyone is rushing to complete their previously stalled HW enablement. I have to review and help design all of this work too! :) BTW Michael's selftest won't be in part 1 because it needs the ops to be restored in order to work (now done in part 2), and has a few other more minor dependencies on part 2 and 3. Thanks, Jason
On Fri, Feb 23, 2024 at 11:18:41AM -0400, Jason Gunthorpe wrote: > On Thu, Feb 22, 2024 at 05:43:46PM +0000, Will Deacon wrote: > > On Wed, Feb 21, 2024 at 10:08:18AM -0400, Jason Gunthorpe wrote: > > > On Wed, Feb 21, 2024 at 01:49:23PM +0000, Will Deacon wrote: > > > > > > > Very roughly, yes, although I'd go further and just return a bitmap of > > > > used qwords instead of tracking these bits. Basically, we could have some > > > > #defines saying which qwords are used by which configs, > > > > > > I don't think this will work well for CD's EPD0 case.. > > > > > > static void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits) > > > { > > > used_bits[0] = cpu_to_le64(CTXDESC_CD_0_V); > > > if (!(ent[0] & cpu_to_le64(CTXDESC_CD_0_V))) > > > return; > > > memset(used_bits, 0xFF, sizeof(struct arm_smmu_cd)); > > > > > > /* EPD0 means T0SZ/TG0/IR0/OR0/SH0/TTB0 are IGNORED */ > > > if (ent[0] & cpu_to_le64(CTXDESC_CD_0_TCR_EPD0)) { > > > used_bits[0] &= ~cpu_to_le64( > > > CTXDESC_CD_0_TCR_T0SZ | CTXDESC_CD_0_TCR_TG0 | > > > CTXDESC_CD_0_TCR_IRGN0 | CTXDESC_CD_0_TCR_ORGN0 | > > > CTXDESC_CD_0_TCR_SH0); > > > used_bits[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK); > > > } > > > } > > > > Please can you explain more about the issue here? I know what EPDx are, > > but I'm not understanding why they're problematic. This presumably > > involves a hitless transition to/from an aborting CD? > > When a process using SVA exits uncleanly the MM is released so the > SMMU HW must stop chasing the page table pointers since all that > memory will be freed. > > However, in an unclean exit we can't control the order of shutdown so > something like uacce or RDMA may not have quieted the DMA device yet. > > So there is a period during shutdown where the mm has been released > and the device is doing DMA, the desire is that the DMA continue to be > handled as a PRI and the SW will return failure for all PRI requests. > > Specifically we do not want to trigger any dmesg log events during > this condition. Curious, but why is it problematic to log events? As you say, it's an "unclean" exit, so it doesn't seem that unreasonable to me. > Jean-Philippe came up with this solution where we hitlessly use EPD0 > in release to allow the mm to release the page table while continuing > to use the PRI flow. > > So it is going from a "SVA domain with a page table" to a "SVA domain > without a page table but EPD0 set", hitlessly. Ok, and so the reason this adds complexity is because the set of used bits/qwords changes based on something other than the cfg? I think it's a pretty weak argument for field vs qwords, but it's a good counter-example to my naive approach of per-config masks, so thanks. > BTW Michael's self test won't be in part 1 because it needs the ops to > be restored in order to work (now done in part 2), and has a few other > more minor dependencies on part 2 and 3. That's a pity, but fair enough. Will
On Tue, Feb 27, 2024 at 12:43:18PM +0000, Will Deacon wrote: > On Fri, Feb 23, 2024 at 11:18:41AM -0400, Jason Gunthorpe wrote: > > On Thu, Feb 22, 2024 at 05:43:46PM +0000, Will Deacon wrote: > > > On Wed, Feb 21, 2024 at 10:08:18AM -0400, Jason Gunthorpe wrote: > > > > On Wed, Feb 21, 2024 at 01:49:23PM +0000, Will Deacon wrote: > > > > > > > > > Very roughly, yes, although I'd go further and just return a bitmap of > > > > > used qwords instead of tracking these bits. Basically, we could have some > > > > > #defines saying which qwords are used by which configs, > > > > > > > > I don't think this will work well for CD's EPD0 case.. > > > > > > > > static void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits) > > > > { > > > > used_bits[0] = cpu_to_le64(CTXDESC_CD_0_V); > > > > if (!(ent[0] & cpu_to_le64(CTXDESC_CD_0_V))) > > > > return; > > > > memset(used_bits, 0xFF, sizeof(struct arm_smmu_cd)); > > > > > > > > /* EPD0 means T0SZ/TG0/IR0/OR0/SH0/TTB0 are IGNORED */ > > > > if (ent[0] & cpu_to_le64(CTXDESC_CD_0_TCR_EPD0)) { > > > > used_bits[0] &= ~cpu_to_le64( > > > > CTXDESC_CD_0_TCR_T0SZ | CTXDESC_CD_0_TCR_TG0 | > > > > CTXDESC_CD_0_TCR_IRGN0 | CTXDESC_CD_0_TCR_ORGN0 | > > > > CTXDESC_CD_0_TCR_SH0); > > > > used_bits[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK); > > > > } > > > > } > > > > > > Please can you explain more about the issue here? I know what EPDx are, > > > but I'm not understanding why they're problematic. This presumably > > > involves a hitless transition to/from an aborting CD? > > > > When a process using SVA exits uncleanly the MM is released so the > > SMMU HW must stop chasing the page table pointers since all that > > memory will be freed. > > > > However, in an unclean exit we can't control the order of shutdown so > > something like uacce or RDMA may not have quieted the DMA device yet. > > > > So there is a period during shutdown where the mm has been released > > and the device is doing DMA, the desire is that the DMA continue to be > > handled as a PRI and the SW will return failure for all PRI requests. > > > > Specifically we do not want to trigger any dmesg log events during > > this condition. > > Curious, but why is it problematic to log events? As you say, it's an > "unclean" exit, so it doesn't seem that unreasonable to me. Well, I would defer to Jean-Philippe, but I can understand the logic. A user ctrl-c's their application it is not nice to get some dmesg logs from that. I recall he felt strongly about this, we had some discussion about it related to the mmu notifiers back when the iommu drivers were all updated to the new notifier API I built... > > Jean-Philippe came up with this solution where we hitlessly use EPD0 > > in release to allow the mm to release the page table while continuing > > to use the PRI flow. > > > > So it is going from a "SVA domain with a page table" to a "SVA domain > > without a page table but EPD0 set", hitlessly. > > Ok, and so the reason this adds complexity is because the set of used > bits/qwords changes based on something other than the cfg? There is no cfg for CD entries? I think it is the same issue as SHCFG, qw1 of CD is not neatly split and qw0/1 are both changing for the EPD0 case - we also zero the unused TCR/TTB. > I think it's a pretty weak argument for field vs qwords, but it's a > good counter-example to my naive approach of per-config masks, so > thanks. We could do EPD0 just by editting in place, it would be easy to code, but the point of this design was to never edit a descriptor in place. Jason
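For the record, the EPD0 flow under discussion amounts to something like this on the CD side (a sketch against the part-2 struct arm_smmu_cd, with an invented helper name; it is not code from this series):

static void arm_smmu_make_quiet_cd(struct arm_smmu_cd *target,
                                   const struct arm_smmu_cd *cur)
{
        *target = *cur;
        /* Disable TTB0 walks: T0SZ/TG0/IRGN0/ORGN0/SH0 and TTB0 become IGNORED */
        target->data[0] |= cpu_to_le64(CTXDESC_CD_0_TCR_EPD0);
        /* Zero the now-unused table pointer so nothing refers to freed memory */
        target->data[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK);
}

Per the arm_smmu_get_cd_used() sketch above, qword 1 stops being used the moment EPD0 is set, so arm_smmu_write_entry() can apply this without a V=0 step and faulting DMA keeps being handled through the PRI path.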
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 0ffb1cf17e0b2e..f0b915567cbcdc 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -48,6 +48,21 @@ enum arm_smmu_msi_index { ARM_SMMU_MAX_MSIS, }; +struct arm_smmu_entry_writer_ops; +struct arm_smmu_entry_writer { + const struct arm_smmu_entry_writer_ops *ops; + struct arm_smmu_master *master; +}; + +struct arm_smmu_entry_writer_ops { + unsigned int num_entry_qwords; + __le64 v_bit; + void (*get_used)(const __le64 *entry, __le64 *used); + void (*sync)(struct arm_smmu_entry_writer *writer); +}; + +#define NUM_ENTRY_QWORDS (sizeof(struct arm_smmu_ste) / sizeof(u64)) + static phys_addr_t arm_smmu_msi_cfg[ARM_SMMU_MAX_MSIS][3] = { [EVTQ_MSI_INDEX] = { ARM_SMMU_EVTQ_IRQ_CFG0, @@ -971,6 +986,140 @@ void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid) arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } +/* + * Figure out if we can do a hitless update of entry to become target. Returns a + * bit mask where 1 indicates that qword needs to be set disruptively. + * unused_update is an intermediate value of entry that has unused bits set to + * their new values. + */ +static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer, + const __le64 *entry, const __le64 *target, + __le64 *unused_update) +{ + __le64 target_used[NUM_ENTRY_QWORDS] = {}; + __le64 cur_used[NUM_ENTRY_QWORDS] = {}; + u8 used_qword_diff = 0; + unsigned int i; + + writer->ops->get_used(entry, cur_used); + writer->ops->get_used(target, target_used); + + for (i = 0; i != writer->ops->num_entry_qwords; i++) { + /* + * Check that masks are up to date, the make functions are not + * allowed to set a bit to 1 if the used function doesn't say it + * is used. + */ + WARN_ON_ONCE(target[i] & ~target_used[i]); + + /* Bits can change because they are not currently being used */ + unused_update[i] = (entry[i] & cur_used[i]) | + (target[i] & ~cur_used[i]); + /* + * Each bit indicates that a used bit in a qword needs to be + * changed after unused_update is applied. + */ + if ((unused_update[i] & target_used[i]) != target[i]) + used_qword_diff |= 1 << i; + } + return used_qword_diff; +} + +static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry, + const __le64 *target, unsigned int start, + unsigned int len) +{ + bool changed = false; + unsigned int i; + + for (i = start; len != 0; len--, i++) { + if (entry[i] != target[i]) { + WRITE_ONCE(entry[i], target[i]); + changed = true; + } + } + + if (changed) + writer->ops->sync(writer); + return changed; +} + +/* + * Update the STE/CD to the target configuration. The transition from the + * current entry to the target entry takes place over multiple steps that + * attempts to make the transition hitless if possible. This function takes care + * not to create a situation where the HW can perceive a corrupted entry. HW is + * only required to have a 64 bit atomicity with stores from the CPU, while + * entries are many 64 bit values big. + * + * The difference between the current value and the target value is analyzed to + * determine which of three updates are required - disruptive, hitless or no + * change. 
+ *
+ * In the most general disruptive case we can make any update in three steps:
+ *  - Disrupting the entry (V=0)
+ *  - Fill now unused qwords, except qword 0 which contains V
+ *  - Make qword 0 have the final value and valid (V=1) with a single 64
+ *    bit store
+ *
+ * However this disrupts the HW while it is happening. There are several
+ * interesting cases where a STE/CD can be updated without disturbing the HW
+ * because only a small number of bits are changing (S1DSS, CONFIG, etc) or
+ * because the used bits don't intersect. We can detect this by calculating how
+ * many 64 bit values need update after adjusting the unused bits and skip the
+ * V=0 process. This relies on the IGNORED behavior described in the
+ * specification.
+ */
+static void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer,
+				 __le64 *entry, const __le64 *target)
+{
+	unsigned int num_entry_qwords = writer->ops->num_entry_qwords;
+	__le64 unused_update[NUM_ENTRY_QWORDS];
+	u8 used_qword_diff;
+
+	used_qword_diff =
+		arm_smmu_entry_qword_diff(writer, entry, target, unused_update);
+	if (hweight8(used_qword_diff) == 1) {
+		/*
+		 * Only one qword needs its used bits to be changed. This is a
+		 * hitless update, update all bits the current STE is ignoring
+		 * to their new values, then update a single "critical qword" to
+		 * change the STE and finally 0 out any bits that are now unused
+		 * in the target configuration.
+		 */
+		unsigned int critical_qword_index = ffs(used_qword_diff) - 1;
+
+		/*
+		 * Skip writing unused bits in the critical qword since we'll be
+		 * writing it in the next step anyway. This can save a sync
+		 * when the only change is in that qword.
+		 */
+		unused_update[critical_qword_index] =
+			entry[critical_qword_index];
+		entry_set(writer, entry, unused_update, 0, num_entry_qwords);
+		entry_set(writer, entry, target, critical_qword_index, 1);
+		entry_set(writer, entry, target, 0, num_entry_qwords);
+	} else if (used_qword_diff) {
+		/*
+		 * At least two qwords need their inuse bits to be changed. This
+		 * requires a breaking update, zero the V bit, write all qwords
+		 * but 0, then set qword 0
+		 */
+		unused_update[0] = entry[0] & (~writer->ops->v_bit);
+		entry_set(writer, entry, unused_update, 0, 1);
+		entry_set(writer, entry, target, 1, num_entry_qwords - 1);
+		entry_set(writer, entry, target, 0, 1);
+	} else {
+		/*
+		 * No inuse bit changed. Sanity check that all unused bits are 0
+		 * in the entry. The target was already sanity checked by
+		 * compute_qword_diff().
+		 */
+		WARN_ON_ONCE(
+			entry_set(writer, entry, target, 0, num_entry_qwords));
+	}
+}
+
 static void arm_smmu_sync_cd(struct arm_smmu_master *master, int ssid,
 			     bool leaf)
 {
@@ -1238,50 +1387,126 @@ arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc)
 	WRITE_ONCE(*dst, cpu_to_le64(val));
 }
 
-static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid)
+struct arm_smmu_ste_writer {
+	struct arm_smmu_entry_writer writer;
+	u32 sid;
+};
+
+/*
+ * Based on the value of ent report which bits of the STE the HW will access. It
+ * would be nice if this was complete according to the spec, but minimally it
+ * has to capture the bits this driver uses.
+ */
+static void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits)
 {
+	unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0]));
+
+	used_bits[0] = cpu_to_le64(STRTAB_STE_0_V);
+	if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V)))
+		return;
+
+	/*
+	 * See 13.5 Summary of attribute/permission configuration fields for the
+	 * SHCFG behavior. It is only used for BYPASS, including S1DSS BYPASS,
+	 * and S2 only.
+	 */
+	if (cfg == STRTAB_STE_0_CFG_BYPASS ||
+	    cfg == STRTAB_STE_0_CFG_S2_TRANS ||
+	    (cfg == STRTAB_STE_0_CFG_S1_TRANS &&
+	     FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) ==
+		     STRTAB_STE_1_S1DSS_BYPASS))
+		used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG);
+
+	used_bits[0] |= cpu_to_le64(STRTAB_STE_0_CFG);
+	switch (cfg) {
+	case STRTAB_STE_0_CFG_ABORT:
+	case STRTAB_STE_0_CFG_BYPASS:
+		break;
+	case STRTAB_STE_0_CFG_S1_TRANS:
+		used_bits[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT |
+					    STRTAB_STE_0_S1CTXPTR_MASK |
+					    STRTAB_STE_0_S1CDMAX);
+		used_bits[1] |=
+			cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR |
+				    STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH |
+				    STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW);
+		used_bits[1] |= cpu_to_le64(STRTAB_STE_1_EATS);
+		used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID);
+		break;
+	case STRTAB_STE_0_CFG_S2_TRANS:
+		used_bits[1] |=
+			cpu_to_le64(STRTAB_STE_1_EATS);
+		used_bits[2] |=
+			cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR |
+				    STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI |
+				    STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R);
+		used_bits[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK);
+		break;
+
+	default:
+		memset(used_bits, 0xFF, sizeof(struct arm_smmu_ste));
+		WARN_ON(true);
+	}
+}
+
+static void arm_smmu_ste_writer_sync_entry(struct arm_smmu_entry_writer *writer)
+{
+	struct arm_smmu_ste_writer *ste_writer =
+		container_of(writer, struct arm_smmu_ste_writer, writer);
 	struct arm_smmu_cmdq_ent cmd = {
 		.opcode	= CMDQ_OP_CFGI_STE,
 		.cfgi	= {
-			.sid	= sid,
+			.sid	= ste_writer->sid,
 			.leaf	= true,
 		},
 	};
 
-	arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd);
+	arm_smmu_cmdq_issue_cmd_with_sync(writer->master->smmu, &cmd);
+}
+
+static const struct arm_smmu_entry_writer_ops arm_smmu_ste_writer_ops = {
+	.sync = arm_smmu_ste_writer_sync_entry,
+	.get_used = arm_smmu_get_ste_used,
+	.v_bit = cpu_to_le64(STRTAB_STE_0_V),
+	.num_entry_qwords = sizeof(struct arm_smmu_ste) / sizeof(u64),
+};
+
+static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
+			       struct arm_smmu_ste *ste,
+			       const struct arm_smmu_ste *target)
+{
+	struct arm_smmu_device *smmu = master->smmu;
+	struct arm_smmu_ste_writer ste_writer = {
+		.writer = {
+			.ops = &arm_smmu_ste_writer_ops,
+			.master = master,
+		},
+		.sid = sid,
+	};
+
+	arm_smmu_write_entry(&ste_writer.writer, ste->data, target->data);
+
+	/* It's likely that we'll want to use the new STE soon */
+	if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) {
+		struct arm_smmu_cmdq_ent
+			prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG,
+					 .prefetch = {
						 .sid = sid,
+					 } };
+
+		arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
+	}
 }
 
 static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
 				      struct arm_smmu_ste *dst)
 {
-	/*
-	 * This is hideously complicated, but we only really care about
-	 * three cases at the moment:
-	 *
-	 * 1. Invalid (all zero) -> bypass/fault (init)
-	 * 2. Bypass/fault -> translation/bypass (attach)
-	 * 3. Translation/bypass -> bypass/fault (detach)
-	 *
-	 * Given that we can't update the STE atomically and the SMMU
-	 * doesn't read the thing in a defined order, that leaves us
-	 * with the following maintenance requirements:
-	 *
-	 * 1. Update Config, return (init time STEs aren't live)
-	 * 2. Write everything apart from dword 0, sync, write dword 0, sync
-	 * 3. Update Config, sync
-	 */
-	u64 val = le64_to_cpu(dst->data[0]);
-	bool ste_live = false;
+	u64 val;
 	struct arm_smmu_device *smmu = master->smmu;
 	struct arm_smmu_ctx_desc_cfg *cd_table = NULL;
 	struct arm_smmu_s2_cfg *s2_cfg = NULL;
 	struct arm_smmu_domain *smmu_domain = master->domain;
-	struct arm_smmu_cmdq_ent prefetch_cmd = {
-		.opcode		= CMDQ_OP_PREFETCH_CFG,
-		.prefetch	= {
-			.sid	= sid,
-		},
-	};
+	struct arm_smmu_ste target = {};
 
 	if (smmu_domain) {
 		switch (smmu_domain->stage) {
@@ -1296,22 +1521,6 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
 		}
 	}
 
-	if (val & STRTAB_STE_0_V) {
-		switch (FIELD_GET(STRTAB_STE_0_CFG, val)) {
-		case STRTAB_STE_0_CFG_BYPASS:
-			break;
-		case STRTAB_STE_0_CFG_S1_TRANS:
-		case STRTAB_STE_0_CFG_S2_TRANS:
-			ste_live = true;
-			break;
-		case STRTAB_STE_0_CFG_ABORT:
-			BUG_ON(!disable_bypass);
-			break;
-		default:
-			BUG(); /* STE corruption */
-		}
-	}
-
 	/* Nuke the existing STE_0 value, as we're going to rewrite it */
 	val = STRTAB_STE_0_V;
 
@@ -1322,16 +1531,11 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
 		else
 			val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS);
 
-		dst->data[0] = cpu_to_le64(val);
-		dst->data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
+		target.data[0] = cpu_to_le64(val);
+		target.data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG,
 						STRTAB_STE_1_SHCFG_INCOMING));
-		dst->data[2] = 0; /* Nuke the VMID */
-		/*
-		 * The SMMU can perform negative caching, so we must sync
-		 * the STE regardless of whether the old value was live.
-		 */
-		if (smmu)
-			arm_smmu_sync_ste_for_sid(smmu, sid);
+		target.data[2] = 0; /* Nuke the VMID */
+		arm_smmu_write_ste(master, sid, dst, &target);
 		return;
 	}
 
@@ -1339,8 +1543,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
 		u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ?
 			STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1;
 
-		BUG_ON(ste_live);
-		dst->data[1] = cpu_to_le64(
+		target.data[1] = cpu_to_le64(
 			 FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) |
 			 FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) |
 			 FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) |
@@ -1349,7 +1552,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
 
 		if (smmu->features & ARM_SMMU_FEAT_STALLS &&
 		    !master->stall_enabled)
-			dst->data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
+			target.data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
 
 		val |= (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) |
 			FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) |
@@ -1358,8 +1561,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
 	}
 
 	if (s2_cfg) {
-		BUG_ON(ste_live);
-		dst->data[2] = cpu_to_le64(
+		target.data[2] = cpu_to_le64(
 			 FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) |
 			 FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) |
 #ifdef __BIG_ENDIAN
@@ -1368,23 +1570,17 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid,
 			 STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 |
 			 STRTAB_STE_2_S2R);
 
-		dst->data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK);
+		target.data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK);
 
 		val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS);
 	}
 
 	if (master->ats_enabled)
-		dst->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS,
+		target.data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS,
 						 STRTAB_STE_1_EATS_TRANS));
 
-	arm_smmu_sync_ste_for_sid(smmu, sid);
-	/* See comment in arm_smmu_write_ctx_desc() */
-	WRITE_ONCE(dst->data[0], cpu_to_le64(val));
-	arm_smmu_sync_ste_for_sid(smmu, sid);
-
-	/* It's likely that we'll want to use the new STE soon */
-	if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH))
-		arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd);
+	target.data[0] = cpu_to_le64(val);
+	arm_smmu_write_ste(master, sid, dst, &target);
 }
 
 static void arm_smmu_init_bypass_stes(struct arm_smmu_ste *strtab,