diff mbox series

Revert "iommu/arm-smmu: Convert to domain_alloc_paging()"

Message ID 20240213-iommu-revert-domain-alloc-v1-1-325ff55dece4@linaro.org (mailing list archive)
State New, archived
Headers show
Series Revert "iommu/arm-smmu: Convert to domain_alloc_paging()" | expand

Commit Message

Dmitry Baryshkov Feb. 13, 2024, 11:31 a.m. UTC
This reverts commit 9b3febc3a3da ("iommu/arm-smmu: Convert to
domain_alloc_paging()"). It breaks Qualcomm MSM8996 platform. Calling
arm_smmu_write_context_bank() from new codepath results in the platform
being reset because of the unclocked hardware access.

Fixes: 9b3febc3a3da ("iommu/arm-smmu: Convert to domain_alloc_paging()")
Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)


---
base-commit: 46d4e2eb58e14c8935fa0e27d16d4c62ef82849a
change-id: 20240213-iommu-revert-domain-alloc-fa729e537df5

Best regards,

Comments

Robin Murphy Feb. 13, 2024, 11:38 a.m. UTC | #1
On 2024-02-13 11:31 am, Dmitry Baryshkov wrote:
> This reverts commit 9b3febc3a3da ("iommu/arm-smmu: Convert to
> domain_alloc_paging()"). It breaks Qualcomm MSM8996 platform. Calling
> arm_smmu_write_context_bank() from new codepath results in the platform
> being reset because of the unclocked hardware access.

Acked-by: Robin Murphy <robin.murphy@arm.com>

> Fixes: 9b3febc3a3da ("iommu/arm-smmu: Convert to domain_alloc_paging()")
> Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
> ---
>   drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 ++++++-----------
>   1 file changed, 6 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
> index 68b6bc5e7c71..6317aaf7b3ab 100644
> --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
> +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
> @@ -859,10 +859,14 @@ static void arm_smmu_destroy_domain_context(struct arm_smmu_domain *smmu_domain)
>   	arm_smmu_rpm_put(smmu);
>   }
>   
> -static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev)
> +static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
>   {
>   	struct arm_smmu_domain *smmu_domain;
>   
> +	if (type != IOMMU_DOMAIN_UNMANAGED) {
> +		if (using_legacy_binding || type != IOMMU_DOMAIN_DMA)
> +			return NULL;
> +	}
>   	/*
>   	 * Allocate the domain and initialise some of its data structures.
>   	 * We can't really do anything meaningful until we've added a
> @@ -875,15 +879,6 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev)
>   	mutex_init(&smmu_domain->init_mutex);
>   	spin_lock_init(&smmu_domain->cb_lock);
>   
> -	if (dev) {
> -		struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
> -
> -		if (arm_smmu_init_domain_context(smmu_domain, cfg->smmu, dev)) {
> -			kfree(smmu_domain);
> -			return NULL;
> -		}
> -	}
> -
>   	return &smmu_domain->domain;
>   }
>   
> @@ -1600,7 +1595,7 @@ static struct iommu_ops arm_smmu_ops = {
>   	.identity_domain	= &arm_smmu_identity_domain,
>   	.blocked_domain		= &arm_smmu_blocked_domain,
>   	.capable		= arm_smmu_capable,
> -	.domain_alloc_paging	= arm_smmu_domain_alloc_paging,
> +	.domain_alloc		= arm_smmu_domain_alloc,
>   	.probe_device		= arm_smmu_probe_device,
>   	.release_device		= arm_smmu_release_device,
>   	.probe_finalize		= arm_smmu_probe_finalize,
> 
> ---
> base-commit: 46d4e2eb58e14c8935fa0e27d16d4c62ef82849a
> change-id: 20240213-iommu-revert-domain-alloc-fa729e537df5
> 
> Best regards,
Jason Gunthorpe Feb. 13, 2024, 12:12 p.m. UTC | #2
On Tue, Feb 13, 2024 at 01:31:56PM +0200, Dmitry Baryshkov wrote:
> This reverts commit 9b3febc3a3da ("iommu/arm-smmu: Convert to
> domain_alloc_paging()"). It breaks Qualcomm MSM8996 platform. Calling
> arm_smmu_write_context_bank() from new codepath results in the platform
> being reset because of the unclocked hardware access.
> 
> Fixes: 9b3febc3a3da ("iommu/arm-smmu: Convert to domain_alloc_paging()")
> Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
> ---
>  drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 ++++++-----------
>  1 file changed, 6 insertions(+), 11 deletions(-)

Please no, as I said in the other email the only thing that should be
reverted is this:

> @@ -875,15 +879,6 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev)
>  	mutex_init(&smmu_domain->init_mutex);
>  	spin_lock_init(&smmu_domain->cb_lock);
>  
> -	if (dev) {
> -		struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
> -
> -		if (arm_smmu_init_domain_context(smmu_domain, cfg->smmu, dev)) {
> -			kfree(smmu_domain);
> -			return NULL;
> -		}
> -	}
> -
>  	return &smmu_domain->domain;
>  }

Everything else is fine, you already tested with that arrangement.

Jason
Will Deacon Feb. 13, 2024, 12:19 p.m. UTC | #3
On Tue, Feb 13, 2024 at 08:12:57AM -0400, Jason Gunthorpe wrote:
> On Tue, Feb 13, 2024 at 01:31:56PM +0200, Dmitry Baryshkov wrote:
> > This reverts commit 9b3febc3a3da ("iommu/arm-smmu: Convert to
> > domain_alloc_paging()"). It breaks Qualcomm MSM8996 platform. Calling
> > arm_smmu_write_context_bank() from new codepath results in the platform
> > being reset because of the unclocked hardware access.
> > 
> > Fixes: 9b3febc3a3da ("iommu/arm-smmu: Convert to domain_alloc_paging()")
> > Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
> > ---
> >  drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 ++++++-----------
> >  1 file changed, 6 insertions(+), 11 deletions(-)
> 
> Please no, as I said in the other email the only thing that should be
> reverted is this:
> 
> > @@ -875,15 +879,6 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev)
> >  	mutex_init(&smmu_domain->init_mutex);
> >  	spin_lock_init(&smmu_domain->cb_lock);
> >  
> > -	if (dev) {
> > -		struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
> > -
> > -		if (arm_smmu_init_domain_context(smmu_domain, cfg->smmu, dev)) {
> > -			kfree(smmu_domain);
> > -			return NULL;
> > -		}
> > -	}
> > -
> >  	return &smmu_domain->domain;
> >  }
> 
> Everything else is fine, you already tested with that arrangement.

Partial reverts are a recipe for confusion, so I'll take this and if you'd
like to bring back some of the hunks, please can you send a patch on top
that does that?

Cheers,

Will
Jason Gunthorpe Feb. 13, 2024, 12:53 p.m. UTC | #4
On Tue, Feb 13, 2024 at 12:19:34PM +0000, Will Deacon wrote:
> > > @@ -875,15 +879,6 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev)
> > >  	mutex_init(&smmu_domain->init_mutex);
> > >  	spin_lock_init(&smmu_domain->cb_lock);
> > >  
> > > -	if (dev) {
> > > -		struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
> > > -
> > > -		if (arm_smmu_init_domain_context(smmu_domain, cfg->smmu, dev)) {
> > > -			kfree(smmu_domain);
> > > -			return NULL;
> > > -		}
> > > -	}
> > > -
> > >  	return &smmu_domain->domain;
> > >  }
> > 
> > Everything else is fine, you already tested with that arrangement.
> 
> Partial reverts are a recipe for confusion, so I'll take this and if you'd
> like to bring back some of the hunks, please can you send a patch on top
> that does that?

The typical kernel standard is to fix bugs in patches and only reach
for a wholesale revert if the community is struggling with bug
fixing. Dmitry already tested removing that hunk, Robin explained the
issue, we understand the bug fix is to remove the
arm_smmu_init_domain_context() call. Nothing justifies a full scale
revert.

I'll send another patch if you want, but it seems like a waste of all
our time.

Jason
Will Deacon Feb. 13, 2024, 12:59 p.m. UTC | #5
On Tue, Feb 13, 2024 at 08:53:03AM -0400, Jason Gunthorpe wrote:
> On Tue, Feb 13, 2024 at 12:19:34PM +0000, Will Deacon wrote:
> > > > @@ -875,15 +879,6 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev)
> > > >  	mutex_init(&smmu_domain->init_mutex);
> > > >  	spin_lock_init(&smmu_domain->cb_lock);
> > > >  
> > > > -	if (dev) {
> > > > -		struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
> > > > -
> > > > -		if (arm_smmu_init_domain_context(smmu_domain, cfg->smmu, dev)) {
> > > > -			kfree(smmu_domain);
> > > > -			return NULL;
> > > > -		}
> > > > -	}
> > > > -
> > > >  	return &smmu_domain->domain;
> > > >  }
> > > 
> > > Everything else is fine, you already tested with that arrangement.
> > 
> > Partial reverts are a recipe for confusion, so I'll take this and if you'd
> > like to bring back some of the hunks, please can you send a patch on top
> > that does that?
> 
> The typical kernel standard is to fix bugs in patches and only reach
> for a wholesale revert if the community is struggling with bug
> fixing. Dmitry already tested removing that hunk, Robin explained the
> issue, we understand the bug fix is to remove the
> arm_smmu_init_domain_context() call. Nothing justifies a full scale
> revert.

I can't say I'm aware of any consensus for how to handle this, to be
completely honest with you. I just personally see a lot of benefit in
reverting to a known-good state, especially when the revert has been
posted by the bug reporter. Then we can add stuff on top of that known
good state without having to worry about any other problems that we're
yet to uncover. Doesn't really sound controversial...

> I'll send another patch if you want, but it seems like a waste of all
> our time.

It's a bug fix, of course it's a waste of time! We're talking minutes
though, right?

Will
Jason Gunthorpe Feb. 13, 2024, 1:47 p.m. UTC | #6
On Tue, Feb 13, 2024 at 12:59:52PM +0000, Will Deacon wrote:
> > The typical kernel standard is to fix bugs in patches and only reach
> > for a wholesale revert if the community is struggling with bug
> > fixing. Dmitry already tested removing that hunk, Robin explained the
> > issue, we understand the bug fix is to remove the
> > arm_smmu_init_domain_context() call. Nothing justifies a full scale
> > revert.
> 
> I can't say I'm aware of any consensus for how to handle this, to be
> completely honest with you. 

Well, I work in a lot of subsystems and this is a surprise to me and
not something I've seen before. Fix the bug, move forward. Reverts are
a cultural admission of failure. I use threats of a revert as a hammer
to encourage people to pay attention to the bugs. I hardly ever
actually revert things. What does reverting their code say to my
submitters???

> > I'll send another patch if you want, but it seems like a waste of all
> > our time.
> 
> It's a bug fix, of course it's a waste of time! We're talking minutes
> though, right?

It becomes hard for maintainers to juggle the tress since you have to
send the revert to -rc and the fix on top of the rc to next. Again, I
will send the patch, just discussing.

Jason
Will Deacon Feb. 13, 2024, 2:09 p.m. UTC | #7
Hey Jason,

On Tue, Feb 13, 2024 at 09:47:26AM -0400, Jason Gunthorpe wrote:
> On Tue, Feb 13, 2024 at 12:59:52PM +0000, Will Deacon wrote:
> > > The typical kernel standard is to fix bugs in patches and only reach
> > > for a wholesale revert if the community is struggling with bug
> > > fixing. Dmitry already tested removing that hunk, Robin explained the
> > > issue, we understand the bug fix is to remove the
> > > arm_smmu_init_domain_context() call. Nothing justifies a full scale
> > > revert.
> > 
> > I can't say I'm aware of any consensus for how to handle this, to be
> > completely honest with you. 
> 
> Well, I work in a lot of subsystems and this is a surprise to me and
> not something I've seen before. Fix the bug, move forward. Reverts are
> a cultural admission of failure. I use threats of a revert as a hammer
> to encourage people to pay attention to the bugs. I hardly ever
> actually revert things. What does reverting their code say to my
> submitters???

Huh. I guess I'm lucky never to have worked in a environment where that
is the case. In fact, my experience is quite the opposite: revert first
so that things get back to a working state and the developer/submitter
has some breathing room to rework the broken code. It's actually fairly
blameless if you get it right and when you have a half-functional CI it's
pretty much a necessity. Anyway, I digress...

So if you see me appearing to be heavy-handed with reverts when dealing
with regressions, it's really nothing tactical. Rather, it's purely
about keeping the driver in a known functional state.

> > > I'll send another patch if you want, but it seems like a waste of all
> > > our time.
> > 
> > It's a bug fix, of course it's a waste of time! We're talking minutes
> > though, right?
> 
> It becomes hard for maintainers to juggle the tress since you have to
> send the revert to -rc and the fix on top of the rc to next. Again, I
> will send the patch, just discussing.

I've never had any difficulty managing that, so I think we'll be ok.
Worst case, I can merge my fixes queue into my next queue.

Will
Jason Gunthorpe Feb. 13, 2024, 2:42 p.m. UTC | #8
On Tue, Feb 13, 2024 at 02:09:00PM +0000, Will Deacon wrote:
> Hey Jason,
> 
> On Tue, Feb 13, 2024 at 09:47:26AM -0400, Jason Gunthorpe wrote:
> > On Tue, Feb 13, 2024 at 12:59:52PM +0000, Will Deacon wrote:
> > > > The typical kernel standard is to fix bugs in patches and only reach
> > > > for a wholesale revert if the community is struggling with bug
> > > > fixing. Dmitry already tested removing that hunk, Robin explained the
> > > > issue, we understand the bug fix is to remove the
> > > > arm_smmu_init_domain_context() call. Nothing justifies a full scale
> > > > revert.
> > > 
> > > I can't say I'm aware of any consensus for how to handle this, to be
> > > completely honest with you. 
> > 
> > Well, I work in a lot of subsystems and this is a surprise to me and
> > not something I've seen before. Fix the bug, move forward. Reverts are
> > a cultural admission of failure. I use threats of a revert as a hammer
> > to encourage people to pay attention to the bugs. I hardly ever
> > actually revert things. What does reverting their code say to my
> > submitters???
> 
> Huh. I guess I'm lucky never to have worked in a environment where that
> is the case. In fact, my experience is quite the opposite: revert first
> so that things get back to a working state and the developer/submitter
> has some breathing room to rework the broken code. It's actually fairly
> blameless if you get it right and when you have a half-functional CI it's
> pretty much a necessity. Anyway, I digress...

Fascinating, I am glad we had this discussion because I was pretty put
off by this talk of revert in this and the other thread. Cultural
differences!

Thanks,
Jason
Dmitry Baryshkov Feb. 13, 2024, 7:49 p.m. UTC | #9
On Tue, 13 Feb 2024 at 14:59, Will Deacon <will@kernel.org> wrote:
>
> On Tue, Feb 13, 2024 at 08:53:03AM -0400, Jason Gunthorpe wrote:
> > On Tue, Feb 13, 2024 at 12:19:34PM +0000, Will Deacon wrote:
> > > > > @@ -875,15 +879,6 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev)
> > > > >         mutex_init(&smmu_domain->init_mutex);
> > > > >         spin_lock_init(&smmu_domain->cb_lock);
> > > > >
> > > > > -       if (dev) {
> > > > > -               struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
> > > > > -
> > > > > -               if (arm_smmu_init_domain_context(smmu_domain, cfg->smmu, dev)) {
> > > > > -                       kfree(smmu_domain);
> > > > > -                       return NULL;
> > > > > -               }
> > > > > -       }
> > > > > -
> > > > >         return &smmu_domain->domain;
> > > > >  }
> > > >
> > > > Everything else is fine, you already tested with that arrangement.
> > >
> > > Partial reverts are a recipe for confusion, so I'll take this and if you'd
> > > like to bring back some of the hunks, please can you send a patch on top
> > > that does that?
> >
> > The typical kernel standard is to fix bugs in patches and only reach
> > for a wholesale revert if the community is struggling with bug
> > fixing. Dmitry already tested removing that hunk, Robin explained the
> > issue, we understand the bug fix is to remove the
> > arm_smmu_init_domain_context() call. Nothing justifies a full scale
> > revert.
>
> I can't say I'm aware of any consensus for how to handle this, to be
> completely honest with you. I just personally see a lot of benefit in
> reverting to a known-good state, especially when the revert has been
> posted by the bug reporter. Then we can add stuff on top of that known
> good state without having to worry about any other problems that we're
> yet to uncover. Doesn't really sound controversial...

Well, I'm open to any patch set that ends up fixing the issue. I won't
insist on landing the revert first, it's up to Will and Robin, I'd
say.

If there are any patches for this matter, please Cc me and
linux-arm-msm@, so that we can reply with the Tested-by trailer.

> > I'll send another patch if you want, but it seems like a waste of all
> > our time.
>
> It's a bug fix, of course it's a waste of time! We're talking minutes
> though, right?
>
> Will
Will Deacon Feb. 14, 2024, 5 p.m. UTC | #10
On Tue, 13 Feb 2024 13:31:56 +0200, Dmitry Baryshkov wrote:
> This reverts commit 9b3febc3a3da ("iommu/arm-smmu: Convert to
> domain_alloc_paging()"). It breaks Qualcomm MSM8996 platform. Calling
> arm_smmu_write_context_bank() from new codepath results in the platform
> being reset because of the unclocked hardware access.
> 
> 

Applied to will (for-joerg/arm-smmu/fixes), thanks!

[1/1] Revert "iommu/arm-smmu: Convert to domain_alloc_paging()"
      https://git.kernel.org/will/c/b419c5e2d978

Cheers,
diff mbox series

Patch

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 68b6bc5e7c71..6317aaf7b3ab 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -859,10 +859,14 @@  static void arm_smmu_destroy_domain_context(struct arm_smmu_domain *smmu_domain)
 	arm_smmu_rpm_put(smmu);
 }
 
-static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev)
+static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
 {
 	struct arm_smmu_domain *smmu_domain;
 
+	if (type != IOMMU_DOMAIN_UNMANAGED) {
+		if (using_legacy_binding || type != IOMMU_DOMAIN_DMA)
+			return NULL;
+	}
 	/*
 	 * Allocate the domain and initialise some of its data structures.
 	 * We can't really do anything meaningful until we've added a
@@ -875,15 +879,6 @@  static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev)
 	mutex_init(&smmu_domain->init_mutex);
 	spin_lock_init(&smmu_domain->cb_lock);
 
-	if (dev) {
-		struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
-
-		if (arm_smmu_init_domain_context(smmu_domain, cfg->smmu, dev)) {
-			kfree(smmu_domain);
-			return NULL;
-		}
-	}
-
 	return &smmu_domain->domain;
 }
 
@@ -1600,7 +1595,7 @@  static struct iommu_ops arm_smmu_ops = {
 	.identity_domain	= &arm_smmu_identity_domain,
 	.blocked_domain		= &arm_smmu_blocked_domain,
 	.capable		= arm_smmu_capable,
-	.domain_alloc_paging	= arm_smmu_domain_alloc_paging,
+	.domain_alloc		= arm_smmu_domain_alloc,
 	.probe_device		= arm_smmu_probe_device,
 	.release_device		= arm_smmu_release_device,
 	.probe_finalize		= arm_smmu_probe_finalize,