diff mbox series

x86/PCI: Convert force_disable_hpet() to standard quirk

Message ID 20201119181904.149129-1-helgaas@kernel.org (mailing list archive)
State Accepted, archived
Delegated to: Bjorn Helgaas
Headers show
Series x86/PCI: Convert force_disable_hpet() to standard quirk | expand

Commit Message

Bjorn Helgaas Nov. 19, 2020, 6:19 p.m. UTC
From: Bjorn Helgaas <bhelgaas@google.com>

62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail
platform") implemented force_disable_hpet() as a special early quirk.
These run before the PCI core is initialized and depend on the
x86/pci/early.c accessors that use I/O ports 0xcf8 and 0xcfc.

But force_disable_hpet() doesn't need to be one of these special early
quirks.  It merely sets "boot_hpet_disable", which is tested by
is_hpet_capable(), which is only used by hpet_enable() and hpet_disable().
hpet_enable() is an fs_initcall(), so it runs after the PCI core is
initialized.

Convert force_disable_hpet() to the standard PCI quirk mechanism.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Kai-Heng Feng <kai.heng.feng@canonical.com>
---
 arch/x86/kernel/early-quirks.c | 24 ------------------------
 arch/x86/pci/fixup.c           | 25 +++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 24 deletions(-)

Comments

Bjorn Helgaas Nov. 24, 2020, 11:27 p.m. UTC | #1
On Thu, Nov 19, 2020 at 12:19:04PM -0600, Bjorn Helgaas wrote:
> From: Bjorn Helgaas <bhelgaas@google.com>
> 
> 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail
> platform") implemented force_disable_hpet() as a special early quirk.
> These run before the PCI core is initialized and depend on the
> x86/pci/early.c accessors that use I/O ports 0xcf8 and 0xcfc.
> 
> But force_disable_hpet() doesn't need to be one of these special early
> quirks.  It merely sets "boot_hpet_disable", which is tested by
> is_hpet_capable(), which is only used by hpet_enable() and hpet_disable().
> hpet_enable() is an fs_initcall(), so it runs after the PCI core is
> initialized.
> 
> Convert force_disable_hpet() to the standard PCI quirk mechanism.
> 
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> Cc: Feng Tang <feng.tang@intel.com>
> Cc: Kai-Heng Feng <kai.heng.feng@canonical.com>

I applied this to pci/misc for v5.11.  Let me know if you see any
issues.

> ---
>  arch/x86/kernel/early-quirks.c | 24 ------------------------
>  arch/x86/pci/fixup.c           | 25 +++++++++++++++++++++++++
>  2 files changed, 25 insertions(+), 24 deletions(-)
> 
> diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
> index a4b5af03dcc1..674967fc1071 100644
> --- a/arch/x86/kernel/early-quirks.c
> +++ b/arch/x86/kernel/early-quirks.c
> @@ -604,14 +604,6 @@ static void __init intel_graphics_quirks(int num, int slot, int func)
>  	}
>  }
>  
> -static void __init force_disable_hpet(int num, int slot, int func)
> -{
> -#ifdef CONFIG_HPET_TIMER
> -	boot_hpet_disable = true;
> -	pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
> -#endif
> -}
> -
>  #define BCM4331_MMIO_SIZE	16384
>  #define BCM4331_PM_CAP		0x40
>  #define bcma_aread32(reg)	ioread32(mmio + 1 * BCMA_CORE_SIZE + reg)
> @@ -701,22 +693,6 @@ static struct chipset early_qrk[] __initdata = {
>  	  PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
>  	{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
>  	  QFLAG_APPLY_ONCE, intel_graphics_quirks },
> -	/*
> -	 * HPET on the current version of the Baytrail platform has accuracy
> -	 * problems: it will halt in deep idle state - so we disable it.
> -	 *
> -	 * More details can be found in section 18.10.1.3 of the datasheet:
> -	 *
> -	 *    http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
> -	 */
> -	{ PCI_VENDOR_ID_INTEL, 0x0f00,
> -		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
> -	{ PCI_VENDOR_ID_INTEL, 0x3e20,
> -		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
> -	{ PCI_VENDOR_ID_INTEL, 0x3ec4,
> -		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
> -	{ PCI_VENDOR_ID_INTEL, 0x8a12,
> -		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
>  	{ PCI_VENDOR_ID_BROADCOM, 0x4331,
>  	  PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
>  	{}
> diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
> index 0a0e168be1cb..865bc3c5188b 100644
> --- a/arch/x86/pci/fixup.c
> +++ b/arch/x86/pci/fixup.c
> @@ -780,3 +780,28 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar);
>  DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar);
>  
>  #endif
> +
> +/*
> + * HPET on the current version of the Baytrail platform has accuracy
> + * problems: it will halt in deep idle state - so we disable it.
> + *
> + * More details can be found in section 18.10.1.3 of the datasheet
> + * (Intel Document Number 332065-003, March 2016):
> + *
> + *    http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
> + */
> +static void force_disable_hpet(struct pci_dev *dev)
> +{
> +#ifdef CONFIG_HPET_TIMER
> +	boot_hpet_disable = true;
> +	pci_info(dev, "x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
> +#endif
> +}
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x0f00,
> +			      PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x3e20,
> +			      PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x3ec4,
> +			      PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
> +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x8a12,
> +			      PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
> -- 
> 2.25.1
>
Thomas Gleixner Nov. 25, 2020, 12:46 p.m. UTC | #2
On Thu, Nov 19 2020 at 12:19, Bjorn Helgaas wrote:
> 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail
> platform") implemented force_disable_hpet() as a special early quirk.
> These run before the PCI core is initialized and depend on the
> x86/pci/early.c accessors that use I/O ports 0xcf8 and 0xcfc.
>
> But force_disable_hpet() doesn't need to be one of these special early
> quirks.  It merely sets "boot_hpet_disable", which is tested by
> is_hpet_capable(), which is only used by hpet_enable() and hpet_disable().
> hpet_enable() is an fs_initcall(), so it runs after the PCI core is
> initialized.

hpet_enable() is not an fs_initcall(). hpet_late_init() is and that
invokes hpet_enable() only for the case that ACPI did not advertise it
and the force_hpet quirk provided a base address.

But hpet_enable() is also invoked via:

 start_kernel()
   late_time_init()
     x86_late_time_init()
       hpet_time_init()

which is way before the PCI core is available and we really don't want
to set it up there if it's known to be broken :)

Now the more interesting question is why this needs to be a PCI quirk in
the first place. Can't we just disable the HPET based on family/model
quirks?

e0748539e3d5 ("x86/intel: Disable HPET on Intel Ice Lake platforms")
f8edbde885bb ("x86/intel: Disable HPET on Intel Coffee Lake H platforms")
fc5db58539b4 ("x86/quirks: Disable HPET on Intel Coffe Lake platforms")
62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail platform")

I might be missing something here, but in general on anything modern
HPET is mostly useless.

Thanks,

        tglx
Bjorn Helgaas Nov. 25, 2020, 7:13 p.m. UTC | #3
On Wed, Nov 25, 2020 at 01:46:23PM +0100, Thomas Gleixner wrote:
> On Thu, Nov 19 2020 at 12:19, Bjorn Helgaas wrote:
> > 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail
> > platform") implemented force_disable_hpet() as a special early quirk.
> > These run before the PCI core is initialized and depend on the
> > x86/pci/early.c accessors that use I/O ports 0xcf8 and 0xcfc.
> >
> > But force_disable_hpet() doesn't need to be one of these special early
> > quirks.  It merely sets "boot_hpet_disable", which is tested by
> > is_hpet_capable(), which is only used by hpet_enable() and hpet_disable().
> > hpet_enable() is an fs_initcall(), so it runs after the PCI core is
> > initialized.
> 
> hpet_enable() is not an fs_initcall(). hpet_late_init() is and that
> invokes hpet_enable() only for the case that ACPI did not advertise it
> and the force_hpet quirk provided a base address.
> 
> But hpet_enable() is also invoked via:
> 
>  start_kernel()
>    late_time_init()
>      x86_late_time_init()
>        hpet_time_init()
> 
> which is way before the PCI core is available and we really don't want
> to set it up there if it's known to be broken :)

Wow, I really blew that, don't know how I missed that path.  Thanks
for catching this!  I'll drop this patch.

> Now the more interesting question is why this needs to be a PCI quirk in
> the first place. Can't we just disable the HPET based on family/model
> quirks?

You mean like a CPUID check or something?  I'm all in favor of doing
something that doesn't depend on PCI.

> e0748539e3d5 ("x86/intel: Disable HPET on Intel Ice Lake platforms")
> f8edbde885bb ("x86/intel: Disable HPET on Intel Coffee Lake H platforms")
> fc5db58539b4 ("x86/quirks: Disable HPET on Intel Coffe Lake platforms")
> 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail platform")
> 
> I might be missing something here, but in general on anything modern
> HPET is mostly useless.
> 
> Thanks,
> 
>         tglx
>
Thomas Gleixner Nov. 26, 2020, 12:50 a.m. UTC | #4
On Wed, Nov 25 2020 at 13:13, Bjorn Helgaas wrote:
> On Wed, Nov 25, 2020 at 01:46:23PM +0100, Thomas Gleixner wrote:
>> Now the more interesting question is why this needs to be a PCI quirk in
>> the first place. Can't we just disable the HPET based on family/model
>> quirks?
>
> You mean like a CPUID check or something?  I'm all in favor of doing
> something that doesn't depend on PCI.

The reason why these are PCI quirks is that the HPET is not part of the
CPU core. It's usually part of 00:00:0 (aka. host bridge) and the legacy
mess which resides there.

OTOH that thing is integrated into the actual chip nowadays and these
quirks are platform quirks, so the CPUID and the PCI vendor/device codes
should be the same for a particular model.

But what do I know. This whole platform/cpuid mess is inpenetrable even
for people who have access to the Intel internal documentation...

But, the point is that HPET does not provide any value on contemporary
CPUs assumed that Intel finally decided that TSC and the TSC deadline
timer are crucial system components along with the ability to get the
correct frequency of these beasts from firmware/cpuid.

So if parts of a particular chip generation, which we can determine by
CPUID (family, model) has issues with HPET, then there is no value in
preserving HPET for the N out of M submodels which are not affected even
if we can differentiate them by the host bridge device id.

But as I said above, what do I know. The Intel wizards which might have
better insight into this should speak up and come forth with objections.

Otherwise I just go and disable HPET support for the CPU models which
are covered by these PCI quirks today.

Ideally we can just disable it for anything more modern as well, but of
course that requires that future devices have proper frequency
enumeration and Intel prevents the OEMs to screw that up with their
"value add" BIOS/SMM machinery. Hope dies last...

Thanks,

        tglx
Feng Tang Nov. 26, 2020, 1:24 a.m. UTC | #5
Hi Thomas,

On Wed, Nov 25, 2020 at 01:46:23PM +0100, Thomas Gleixner wrote:
> On Thu, Nov 19 2020 at 12:19, Bjorn Helgaas wrote:
> > 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail
> > platform") implemented force_disable_hpet() as a special early quirk.
> > These run before the PCI core is initialized and depend on the
> > x86/pci/early.c accessors that use I/O ports 0xcf8 and 0xcfc.
> >
> > But force_disable_hpet() doesn't need to be one of these special early
> > quirks.  It merely sets "boot_hpet_disable", which is tested by
> > is_hpet_capable(), which is only used by hpet_enable() and hpet_disable().
> > hpet_enable() is an fs_initcall(), so it runs after the PCI core is
> > initialized.
> 
> hpet_enable() is not an fs_initcall(). hpet_late_init() is and that
> invokes hpet_enable() only for the case that ACPI did not advertise it
> and the force_hpet quirk provided a base address.
> 
> But hpet_enable() is also invoked via:
> 
>  start_kernel()
>    late_time_init()
>      x86_late_time_init()
>        hpet_time_init()
> 
> which is way before the PCI core is available and we really don't want
> to set it up there if it's known to be broken :)
> 
> Now the more interesting question is why this needs to be a PCI quirk in
> the first place. Can't we just disable the HPET based on family/model
> quirks?
> 
> e0748539e3d5 ("x86/intel: Disable HPET on Intel Ice Lake platforms")
> f8edbde885bb ("x86/intel: Disable HPET on Intel Coffee Lake H platforms")
> fc5db58539b4 ("x86/quirks: Disable HPET on Intel Coffe Lake platforms")



> 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail platform")
I added this commit, and I can explain some for Baytrail. There was
some discussion about the way to disable it:
https://lore.kernel.org/lkml/20140328073718.GA12762@feng-snb/t/

It uses PCI ID early quirk in the hope that later Baytrail stepping
doesn't have the problem. And later on, there was official document
(section 18.10.1.3 http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf)
stating Baytrail's HPET halts in deep idle. So I think your way of 
using CPUID to disable Baytrail HPET makes more sense.


> I might be missing something here, but in general on anything modern
> HPET is mostly useless.

IIUC, nowdays HPET's main use is as a clocksource watchdog monitor.
And in one debug case, we found it still useful. The debug platform has 
early serial console which prints many messages in early boot phase,
when the HPET is disabled, the software 'jiffies' clocksource will
be used as the monitor. Early printk will disable interrupt will
printing message, and this could be quite long for a slow 115200
device, and cause the periodic HW timer interrupt get missed, and
make the 'jiffies' clocksource not accurate, which will in turn
judge the TSC clocksrouce inaccurate, and disablt it. (Adding Rui,
Len for more details)

Thanks,
Feng


> Thanks,
> 
>         tglx
Thomas Gleixner Nov. 26, 2020, 11:27 p.m. UTC | #6
Feng,

On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
> On Wed, Nov 25, 2020 at 01:46:23PM +0100, Thomas Gleixner wrote:
>> Now the more interesting question is why this needs to be a PCI quirk in
>> the first place. Can't we just disable the HPET based on family/model
>> quirks?
>> 
>> e0748539e3d5 ("x86/intel: Disable HPET on Intel Ice Lake platforms")
>> f8edbde885bb ("x86/intel: Disable HPET on Intel Coffee Lake H platforms")
>> fc5db58539b4 ("x86/quirks: Disable HPET on Intel Coffe Lake platforms")
>> 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail platform")

> I added this commit, and I can explain some for Baytrail. There was
> some discussion about the way to disable it:
> https://lore.kernel.org/lkml/20140328073718.GA12762@feng-snb/t/
>
> It uses PCI ID early quirk in the hope that later Baytrail stepping
> doesn't have the problem. And later on, there was official document
> (section 18.10.1.3 http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf)
> stating Baytrail's HPET halts in deep idle. So I think your way of 
> using CPUID to disable Baytrail HPET makes more sense.

Correct.

>> I might be missing something here, but in general on anything modern
>> HPET is mostly useless.
>
> IIUC, nowdays HPET's main use is as a clocksource watchdog monitor.

Plus for the TSC refined calibration, where it is really better than
anything else we have _if_ it is functional.

> And in one debug case, we found it still useful. The debug platform has 
> early serial console which prints many messages in early boot phase,
> when the HPET is disabled, the software 'jiffies' clocksource will
> be used as the monitor. Early printk will disable interrupt will
> printing message, and this could be quite long for a slow 115200
> device, and cause the periodic HW timer interrupt get missed, and
> make the 'jiffies' clocksource not accurate, which will in turn
> judge the TSC clocksrouce inaccurate, and disablt it. (Adding Rui,
> Len for more details)

Yes, that can happen. But OTOH, we should start to think about the
requirements for using the TSC watchdog.

I'm inclined to lift that requirement when the CPU has:

    1) X86_FEATURE_CONSTANT_TSC
    2) X86_FEATURE_NONSTOP_TSC
    3) X86_FEATURE_NONSTOP_TSC_S3
    4) X86_FEATURE_TSC_ADJUST
    
    5) At max. 4 sockets

After two decades of horrors we're finally at a point where TSC seems to
be halfways reliable and less abused by BIOS tinkerers. TSC_ADJUST was
really key as we can now detect even small modifications reliably and
the important point is that we can cure them as well (not pretty but
better than all other options).

The only nasty one in the list above is #5 because AFAIK there is still
no architecural guarantee for TSCs being synchronized on machines with
more than 4 sockets. I might be wrong, but then nobody told me.

The only reason I hate to disable HPET upfront at least during boot is
that HPET is the best mechanism for the refined TSC calibration. PMTIMER
sucks because it's slow and wraps around pretty quick.

So we could do the following even on platforms where HPET stops in some
magic PC? state:

  - Register it during early boot as clocksource

  - Prevent the enablement as clockevent and the chardev hpet timer muck

  - Prevent the magic PC? state up to the point where the refined
    TSC calibration is finished.

  - Unregister it once the TSC has taken over as system clocksource and
    enable the magic PC? state in which HPET gets disfunctional.

Hmm?

Thanks,

        tglx
Feng Tang Nov. 27, 2020, 6:11 a.m. UTC | #7
Hi Thomas,

On Fri, Nov 27, 2020 at 12:27:34AM +0100, Thomas Gleixner wrote:
> Feng,
> 
> On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
> > On Wed, Nov 25, 2020 at 01:46:23PM +0100, Thomas Gleixner wrote:
> >> Now the more interesting question is why this needs to be a PCI quirk in
> >> the first place. Can't we just disable the HPET based on family/model
> >> quirks?
> >> 
> >> e0748539e3d5 ("x86/intel: Disable HPET on Intel Ice Lake platforms")
> >> f8edbde885bb ("x86/intel: Disable HPET on Intel Coffee Lake H platforms")
> >> fc5db58539b4 ("x86/quirks: Disable HPET on Intel Coffe Lake platforms")
> >> 62187910b0fc ("x86/intel: Add quirk to disable HPET for the Baytrail platform")
> 
> > I added this commit, and I can explain some for Baytrail. There was
> > some discussion about the way to disable it:
> > https://lore.kernel.org/lkml/20140328073718.GA12762@feng-snb/t/
> >
> > It uses PCI ID early quirk in the hope that later Baytrail stepping
> > doesn't have the problem. And later on, there was official document
> > (section 18.10.1.3 http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf)
> > stating Baytrail's HPET halts in deep idle. So I think your way of 
> > using CPUID to disable Baytrail HPET makes more sense.
> 
> Correct.
> 
> >> I might be missing something here, but in general on anything modern
> >> HPET is mostly useless.
> >
> > IIUC, nowdays HPET's main use is as a clocksource watchdog monitor.
> 
> Plus for the TSC refined calibration, where it is really better than
> anything else we have _if_ it is functional.
> 
> > And in one debug case, we found it still useful. The debug platform has 
> > early serial console which prints many messages in early boot phase,
> > when the HPET is disabled, the software 'jiffies' clocksource will
> > be used as the monitor. Early printk will disable interrupt will
> > printing message, and this could be quite long for a slow 115200
> > device, and cause the periodic HW timer interrupt get missed, and
> > make the 'jiffies' clocksource not accurate, which will in turn
> > judge the TSC clocksrouce inaccurate, and disablt it. (Adding Rui,
> > Len for more details)
> 
> Yes, that can happen. But OTOH, we should start to think about the
> requirements for using the TSC watchdog.
> 
> I'm inclined to lift that requirement when the CPU has:
> 
>     1) X86_FEATURE_CONSTANT_TSC
>     2) X86_FEATURE_NONSTOP_TSC

>     3) X86_FEATURE_NONSTOP_TSC_S3
IIUC, this feature exists for several generations of Atom platforms,
and it is always coupled with 1) and 2), so it could be skipped for
the checking.
	
>     4) X86_FEATURE_TSC_ADJUST
>     
>     5) At max. 4 sockets
> 
> After two decades of horrors we're finally at a point where TSC seems to
> be halfways reliable and less abused by BIOS tinkerers. TSC_ADJUST was
> really key as we can now detect even small modifications reliably and
> the important point is that we can cure them as well (not pretty but
> better than all other options).
> 
> The only nasty one in the list above is #5 because AFAIK there is still
> no architecural guarantee for TSCs being synchronized on machines with
> more than 4 sockets. I might be wrong, but then nobody told me.
> 
> The only reason I hate to disable HPET upfront at least during boot is
> that HPET is the best mechanism for the refined TSC calibration. PMTIMER
> sucks because it's slow and wraps around pretty quick.
> 
> So we could do the following even on platforms where HPET stops in some
> magic PC? state:
> 
>   - Register it during early boot as clocksource
> 
>   - Prevent the enablement as clockevent and the chardev hpet timer muck
> 
>   - Prevent the magic PC? state up to the point where the refined
>     TSC calibration is finished.
> 
>   - Unregister it once the TSC has taken over as system clocksource and
>     enable the magic PC? state in which HPET gets disfunctional.

This looks reasonable to me. 

I have thought about lowering the hpet rating to lower than PMTIMER, so it
still contributes in early boot phase, and fades out after PMTIMER is
initialised.

Thanks,
Feng

> Hmm?
> 
> Thanks,
> 
>         tglx
> 
>
Thomas Gleixner Nov. 30, 2020, 7:21 p.m. UTC | #8
Feng,

On Fri, Nov 27 2020 at 14:11, Feng Tang wrote:
> On Fri, Nov 27, 2020 at 12:27:34AM +0100, Thomas Gleixner wrote:
>> On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
>> Yes, that can happen. But OTOH, we should start to think about the
>> requirements for using the TSC watchdog.
>> 
>> I'm inclined to lift that requirement when the CPU has:
>> 
>>     1) X86_FEATURE_CONSTANT_TSC
>>     2) X86_FEATURE_NONSTOP_TSC
>
>>     3) X86_FEATURE_NONSTOP_TSC_S3
> IIUC, this feature exists for several generations of Atom platforms,
> and it is always coupled with 1) and 2), so it could be skipped for
> the checking.

Yes, we can ignore that bit as it's not widely available and not
required to solve the problem.

>>     4) X86_FEATURE_TSC_ADJUST
>>     
>>     5) At max. 4 sockets
>> 
>> The only reason I hate to disable HPET upfront at least during boot is
>> that HPET is the best mechanism for the refined TSC calibration. PMTIMER
>> sucks because it's slow and wraps around pretty quick.
>> 
>> So we could do the following even on platforms where HPET stops in some
>> magic PC? state:
>> 
>>   - Register it during early boot as clocksource
>> 
>>   - Prevent the enablement as clockevent and the chardev hpet timer muck
>> 
>>   - Prevent the magic PC? state up to the point where the refined
>>     TSC calibration is finished.
>> 
>>   - Unregister it once the TSC has taken over as system clocksource and
>>     enable the magic PC? state in which HPET gets disfunctional.
>
> This looks reasonable to me. 
>
> I have thought about lowering the hpet rating to lower than PMTIMER, so it
> still contributes in early boot phase, and fades out after PMTIMER is
> initialised.

Not a good idea. pm_timer is initialized before the refined calibration
finishes.

Thanks,

        tglx
Feng Tang Dec. 1, 2020, 8:34 a.m. UTC | #9
Hi Thomas,

On Mon, Nov 30, 2020 at 08:21:03PM +0100, Thomas Gleixner wrote:
> Feng,
> 
> On Fri, Nov 27 2020 at 14:11, Feng Tang wrote:
> > On Fri, Nov 27, 2020 at 12:27:34AM +0100, Thomas Gleixner wrote:
> >> On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
> >> Yes, that can happen. But OTOH, we should start to think about the
> >> requirements for using the TSC watchdog.
> >> 
> >> I'm inclined to lift that requirement when the CPU has:
> >> 
> >>     1) X86_FEATURE_CONSTANT_TSC
> >>     2) X86_FEATURE_NONSTOP_TSC
> >
> >>     3) X86_FEATURE_NONSTOP_TSC_S3
> > IIUC, this feature exists for several generations of Atom platforms,
> > and it is always coupled with 1) and 2), so it could be skipped for
> > the checking.
> 
> Yes, we can ignore that bit as it's not widely available and not
> required to solve the problem.
> 
> >>     4) X86_FEATURE_TSC_ADJUST
> >>     
> >>     5) At max. 4 sockets
> >> 
> >> The only reason I hate to disable HPET upfront at least during boot is
> >> that HPET is the best mechanism for the refined TSC calibration. PMTIMER
> >> sucks because it's slow and wraps around pretty quick.
> >> 
> >> So we could do the following even on platforms where HPET stops in some
> >> magic PC? state:
> >> 
> >>   - Register it during early boot as clocksource
> >> 
> >>   - Prevent the enablement as clockevent and the chardev hpet timer muck
> >> 
> >>   - Prevent the magic PC? state up to the point where the refined
> >>     TSC calibration is finished.
> >> 
> >>   - Unregister it once the TSC has taken over as system clocksource and
> >>     enable the magic PC? state in which HPET gets disfunctional.
> >
> > This looks reasonable to me. 
> >
> > I have thought about lowering the hpet rating to lower than PMTIMER, so it
> > still contributes in early boot phase, and fades out after PMTIMER is
> > initialised.
> 
> Not a good idea. pm_timer is initialized before the refined calibration
> finishes.

Yes, you are right. I missed the part.

I dug some old notes, and found another old case (kernel 3.4) that a
broken PMTIMER as the watchdog clocksource wrongly judged TSC to be
unstable and disabled it, which makes me agree more to the idea of
"lift that requirement when the CPU has ..." 

If the TSC has those bits to garantee its reliability, then no need
to use a less reliable clocksource to monitor it.

Thanks,
Feng
Zhang, Rui Dec. 2, 2020, 7:28 a.m. UTC | #10
On Mon, 2020-11-30 at 20:21 +0100, Thomas Gleixner wrote:
> Feng,
> 
> On Fri, Nov 27 2020 at 14:11, Feng Tang wrote:
> > On Fri, Nov 27, 2020 at 12:27:34AM +0100, Thomas Gleixner wrote:
> > > On Thu, Nov 26 2020 at 09:24, Feng Tang wrote:
> > > Yes, that can happen. But OTOH, we should start to think about
> > > the
> > > requirements for using the TSC watchdog.

My original proposal is to disable jiffies and refined-jiffies as the
clocksource watchdog, because they are not reliable and it's better to
use clocksource that has a hardware counter as watchdog, like the patch
below, which I didn't sent out for upstream.

From cf9ce0ecab8851a3745edcad92e072022af3dbd9 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Fri, 19 Jun 2020 22:03:23 +0800
Subject: [RFC PATCH] time/clocksource: do not use refined-jiffies as watchdog

On IA platforms, if HPET is disabled, either via x86 early-quirks, or
via kernel commandline, refined-jiffies will be used as clocksource
watchdog in early boot phase, before acpi_pm timer registered.

This is not a problem if jiffies are accurate.
But in some cases, for example, when serial console is enabled, it may
take several milliseconds to write to the console, with irq disabled,
frequently. Thus many ticks may become longer than it should be.

Using refined-jiffies as watchdog in this case breaks the system because
a) duration calculated by refined-jiffies watchdog is always consistent
   with the watchdog timeout issued using add_timer(), say, around 500ms.
b) duration calculated by the running clocksource, usually TSC on IA
   platforms, reflects the real time cost, which may be much larger.
This results in the running clocksource being disabled erroneously.

This is reproduced on ICL because HPET is disabled in x86 early-quirks,
and also reproduced on a KBL and a WHL platform when HPET is disabled
via command line.

BTW, commit fd329f276eca
("x86/mtrr: Skip cache flushes on CPUs with cache self-snooping") is
another example that refined-jiffies causes the same problem when ticks
become slow for some other reason.

IMO, the right solution is to only use hardware clocksource as watchdog.
Then even if ticks are slow, both the running clocksource and the watchdog
returns real time cost, and they still match.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
---
 kernel/time/clocksource.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 02441ead3c3b..e7e703858fa6 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -364,6 +364,10 @@ static void clocksource_select_watchdog(bool fallback)
 		watchdog = NULL;
 
 	list_for_each_entry(cs, &clocksource_list, list) {
+		/* Do not use refined-jiffies as clocksource watchdog */
+		if (cs->rating <= 2)
+			continue;
+
 		/* cs is a clocksource to be watched. */
 		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
 			continue;
diff mbox series

Patch

diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a4b5af03dcc1..674967fc1071 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -604,14 +604,6 @@  static void __init intel_graphics_quirks(int num, int slot, int func)
 	}
 }
 
-static void __init force_disable_hpet(int num, int slot, int func)
-{
-#ifdef CONFIG_HPET_TIMER
-	boot_hpet_disable = true;
-	pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
-#endif
-}
-
 #define BCM4331_MMIO_SIZE	16384
 #define BCM4331_PM_CAP		0x40
 #define bcma_aread32(reg)	ioread32(mmio + 1 * BCMA_CORE_SIZE + reg)
@@ -701,22 +693,6 @@  static struct chipset early_qrk[] __initdata = {
 	  PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
 	{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
 	  QFLAG_APPLY_ONCE, intel_graphics_quirks },
-	/*
-	 * HPET on the current version of the Baytrail platform has accuracy
-	 * problems: it will halt in deep idle state - so we disable it.
-	 *
-	 * More details can be found in section 18.10.1.3 of the datasheet:
-	 *
-	 *    http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
-	 */
-	{ PCI_VENDOR_ID_INTEL, 0x0f00,
-		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
-	{ PCI_VENDOR_ID_INTEL, 0x3e20,
-		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
-	{ PCI_VENDOR_ID_INTEL, 0x3ec4,
-		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
-	{ PCI_VENDOR_ID_INTEL, 0x8a12,
-		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
 	{ PCI_VENDOR_ID_BROADCOM, 0x4331,
 	  PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
 	{}
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 0a0e168be1cb..865bc3c5188b 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -780,3 +780,28 @@  DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar);
 DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar);
 
 #endif
+
+/*
+ * HPET on the current version of the Baytrail platform has accuracy
+ * problems: it will halt in deep idle state - so we disable it.
+ *
+ * More details can be found in section 18.10.1.3 of the datasheet
+ * (Intel Document Number 332065-003, March 2016):
+ *
+ *    http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
+ */
+static void force_disable_hpet(struct pci_dev *dev)
+{
+#ifdef CONFIG_HPET_TIMER
+	boot_hpet_disable = true;
+	pci_info(dev, "x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
+#endif
+}
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x0f00,
+			      PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x3e20,
+			      PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x3ec4,
+			      PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, 0x8a12,
+			      PCI_CLASS_BRIDGE_HOST, 8, force_disable_hpet);